#' Pick power to fit network to a scale-free topology
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net_type Network type. One of 'signed', 'signed hybrid' or
#' 'unsigned'. Default is signed.
#' @param rsquared R squared cutoff. Default is 0.8.
#' @param cor_method Correlation method. One of "pearson", "biweight"
#' or "spearman". Default is "spearman".
#'
#' @return A list containing: \itemize{
#' \item{power}{Optimal power based on scale-free topology fit}
#' \item{plot}{A ggplot object displaying main statistics of the SFT fit test}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @seealso
#' \code{\link[WGCNA]{pickSoftThreshold}}
#' @rdname SFT_fit
#' @export
#' @importFrom WGCNA pickSoftThreshold
#' @importFrom ggplot2 theme element_text geom_hline
#' @examples
#' data(filt.se)
#' sft <- SFT_fit(filt.se, cor_method = "pearson")
SFT_fit <- function(exp, net_type = "signed", rsquared = 0.8,
cor_method = "spearman") {
exp <- handleSE(exp)
texp <- t(exp)
if(cor_method == "pearson") {
sft <- WGCNA::pickSoftThreshold(
texp, networkType = net_type, powerVector = 3:20,
RsquaredCut = rsquared
)
} else if(cor_method == "biweight") {
sft <- WGCNA::pickSoftThreshold(
texp, networkType = net_type, powerVector = 3:20,
RsquaredCut = rsquared, corFnc = bicor,
corOptions = list(use = 'p', maxPOutliers = 0.05)
)
} else if (cor_method == "spearman"){
sft <- WGCNA::pickSoftThreshold(
texp, networkType = net_type, powerVector=3:20,
RsquaredCut = rsquared,
corOptions = list(use = 'p', method = "spearman")
)
} else {
stop("Please, specify a correlation method (one of 'spearman', 'pearson' or 'biweight').")
}
wgcna_power <- sft$powerEstimate
if(is.na(wgcna_power)){
message("No power reached R-squared cut-off, now choosing max R-squared based power")
wgcna_power <- sft$fitIndices$Power[which(sft$fitIndices$SFT.R.sq == max(sft$fitIndices$SFT.R.sq))]
}
# Create data frame with indices to plot
sft_df <- data.frame(
power = sft$fitIndices$Power,
fit = -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq,
meank = sft$fitIndices$mean.k.
)
sft_df <- sft_df[sft_df$fit > 0, ]
# Plot 1
p1 <- ggplot(sft_df, aes(x = .data$power, y = .data$fit)) +
geom_point(color = "gray10") +
ggrepel::geom_text_repel(aes(label = .data$power)) +
labs(
x = "Soft threshold (power)",
y = expression(paste("Scale-free topology fit - ", R^{2})),
title = "Scale independence"
) +
theme_bw() +
ylim(c(0,1)) +
geom_hline(yintercept = rsquared, color = "brown3") +
theme(legend.position = "none")
# Plot 2
p2 <- ggplot(sft_df, aes(x = .data$power, y = .data$meank)) +
geom_point(color = "gray10") +
ggrepel::geom_text_repel(aes(label = .data$power)) +
labs(
x = "Soft threshold (power)",
y = "Mean connectivity (k)",
title = "Mean connectivity"
) +
theme_bw()
# Combined plot
sft_plot <- patchwork::wrap_plots(p1, p2, ncol = 2)
result <- list(power = as.numeric(wgcna_power), plot = sft_plot)
return(result)
}
#' Infer gene coexpression network from gene expression
#'
#' @param exp Either a `SummarizedExperiment` object, or a gene expression
#' matrix/data frame with genes in row names and samples in column names.
#' @param net_type Character indicating the type of network to infer.
#' One of 'signed', 'signed hybrid' or 'unsigned'. Default: 'signed'.
#' @param module_merging_threshold Numeric indicating the minimum correlation
#' threshold to merge similar modules into a single one. Default: 0.8.
#' @param SFTpower Numeric scalar indicating the value of the \eqn{\beta}
#' power to which correlation coefficients will be raised to ensure
#' scale-free topology fit. This value can be obtained with
#' the function \code{SFT_fit()}.
#' @param cor_method Character with correlation method to use.
#' One of "pearson", "biweight" or "spearman". Default: "spearman".
#' @param TOM_type Character specifying the method to use to calculate a
#' topological overlap matrix (TOM). If NULL, TOM type will be automatically
#' inferred from network type specified in \strong{net_type}. Default: NULL.
#' @param min_module_size Numeric indicating the minimum module size.
#' Default: 30.
#' @param return_cormat Logical indicating whether the correlation matrix
#' should be returned. If TRUE (default), an element named `correlation_matrix`
#' containing the correlation matrix will be included in the result list.
#' @param verbose Logical indicating whether to display progress
#' messages or not. Default: FALSE.
#'
#' @return A list containing the following elements: \itemize{
#' \item \emph{adjacency_matrix} Numeric matrix with network adjacencies.
#' \item \emph{MEs} Data frame of module eigengenes, with samples
#' in rows, and module eigengenes in columns.
#' \item \emph{genes_and_modules} Data frame with columns 'Genes' (character)
#' and 'Modules' (character) indicating the genes and the modules to
#' which they belong.
#' \item \emph{kIN} Data frame of degree centrality for each gene,
#' with columns 'kTotal' (total degree), 'kWithin' (intramodular degree),
#' 'kOut' (extra-modular degree), and 'kDiff' (difference between
#' the intra- and extra-modular degree).
#' \item \emph{correlation_matrix} Numeric matrix with pairwise
#' correlation coefficients between genes.
#' If parameter \strong{return_cormat} is FALSE, this will be NULL.
#' \item \emph{params} List with network inference parameters passed
#' as input.
#' \item \emph{dendro_plot_objects} List with objects to plot the dendrogram
#' in \code{plot_dendro_and_colors}. Elements are named 'tree' (an hclust
#' object with gene dendrogram), 'Unmerged' (character with per-gene module
#' assignments before merging similar modules), and 'Merged' (character
#' with per-gene module assignments after merging similar modules).
#' }
#' @author Fabricio Almeida-Silva
#' @rdname exp2gcn
#' @export
#' @importFrom WGCNA TOMsimilarity standardColors labels2colors
#' moduleEigengenes mergeCloseModules intramodularConnectivity
#' @importFrom dynamicTreeCut cutreeDynamicTree
#' @importFrom stats as.dist median cor fisher.test hclust na.omit prcomp
#' qnorm qqplot quantile sd var
#' @importFrom grDevices colorRampPalette
#' @examples
#' data(filt.se)
#' # The SFT fit was previously calculated and the optimal power was 16
#' gcn <- exp2gcn(exp = filt.se, SFTpower = 18, cor_method = "pearson")
exp2gcn <- function(
exp, net_type = "signed", module_merging_threshold = 0.8,
SFTpower = NULL, cor_method = "spearman", TOM_type = NULL,
min_module_size = 30,
return_cormat = TRUE, verbose = FALSE
) {
params <- list(
net_type = net_type,
module_merging_threshold = module_merging_threshold,
SFTpower = SFTpower,
cor_method = cor_method
)
exp <- handleSE(exp)
if(is.null(SFTpower)) { stop("Please, specify the SFT power.") }
# Calculate adjacency matrix
if(verbose) { message("Calculating adjacency matrix...") }
cor_matrix <- NULL
if(return_cormat) {
cor_matrix <- exp2cor(exp, cor_method = cor_method)
adj_matrix <- cor2adj(cor_matrix, beta = SFTpower, net_type = net_type)
} else {
adj_matrix <- exp2cor(exp, cor_method = cor_method) |>
cor2adj(beta = SFTpower, net_type = net_type)
}
# Calculate TOM from adjacency matrix
if(verbose) { message("Calculating topological overlap matrix (TOM)...") }
if(is.null(TOM_type)) { TOM_type <- guess_tom(net_type) }
TOM <- WGCNA::TOMsimilarity(adj_matrix, TOMType = TOM_type)
# Hierarchically cluster genes
geneTree <- hclust(as.dist(1 - TOM), method = "average")
geneTree$height <- round(geneTree$height, 7)
# Detect coexpression modules
if(verbose) { message("Detecting coexpression modules...") }
original_mods <- dynamicTreeCut::cutreeDynamicTree(
dendro = geneTree, minModuleSize = min_module_size, deepSplit = TRUE
)
nmod <- length(unique(original_mods))
palette <- rev(WGCNA::standardColors(nmod))
original_colors <- WGCNA::labels2colors(original_mods, colorSeq = palette)
# Calculate module eigengenes and pairwise distances between them
if(verbose) { message("Calculating module eigengenes (MEs)...") }
me_list <- WGCNA::moduleEigengenes(
t(exp), colors = original_colors, softPower = SFTpower
)
me <- me_list$eigengenes
original_metree <- hclust(
as.dist(1 - cor(me, method = "spearman")), method = "average"
)
# Merge very similar modules
if(verbose) { message("Merging similar modules...") }
merged <- merge_modules(
exp, original_colors, me, palette,
dissimilarity = 1 - module_merging_threshold, cor_method = cor_method
)
new_colors <- merged$colors
new_mes <- merged$newMEs
# Get data frame with genes and modules they belong to
genes_modules <- data.frame(Genes = rownames(exp), Modules = new_colors)
if(verbose) { message("Calculating intramodular connectivity...") }
kwithin <- WGCNA::intramodularConnectivity(adj_matrix, new_colors)
result_list <- list(
adjacency_matrix = adj_matrix,
MEs = new_mes,
genes_and_modules = genes_modules,
kIN = kwithin,
correlation_matrix = cor_matrix,
params = params,
dendro_plot_objects = list(
tree = geneTree, Unmerged = original_colors, Merged = new_colors
)
)
return(result_list)
}
#' Infer gene coexpression network from gene expression in a blockwise manner
#'
#' @param exp Either a `SummarizedExperiment` object, or a gene expression
#' matrix/data frame with genes in row names and samples in column names.
#' @param net_type Character indicating the type of network to infer.
#' One of 'signed', 'signed hybrid' or 'unsigned'. Default: 'signed'.
#' @param module_merging_threshold Numeric indicating the minimum correlation
#' threshold to merge similar modules into a single one. Default: 0.8.
#' @param SFTpower Numeric scalar indicating the value of the \eqn{\beta}
#' power to which correlation coefficients will be raised to ensure
#' scale-free topology fit. This value can be obtained with
#' the function \code{SFT_fit()}.
#' @param cor_method Character with correlation method to use.
#' One of "pearson" or "biweight". Default: "pearson".
#' @param TOM_type Character specifying the method to use to calculate a
#' topological overlap matrix (TOM). If NULL, TOM type will be automatically
#' inferred from network type specified in \strong{net_type}. Default: NULL.
#' @param max_block_size Numeric indicating the maximum block size for module
#' detection.
#' @param min_module_size Numeric indicating the minimum module size.
#' Default: 30.
#' @param ... Additional arguments to \code{WGCNA::blockwiseModules()}.
#'
#' @return A list containing the following elements: \itemize{
#' \item \emph{MEs} Data frame of module eigengenes, with samples
#' in rows, and module eigengenes in columns.
#' \item \emph{genes_and_modules} Data frame with columns 'Genes' (character)
#' and 'Modules' (character) indicating the genes and the modules to
#' which they belong.
#' \item \emph{params} List with network inference parameters passed
#' as input.
#' \item \emph{dendro_plot_objects} List with objects to plot the dendrogram
#' in \code{plot_dendro_and_colors}. Elements are named 'tree' (an hclust
#' object with gene dendrogram), 'Unmerged' (character with per-gene module
#' assignments before merging similar modules), and 'Merged' (character
#' with per-gene module assignments after merging similar modules).
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname exp2gcn_blockwise
#' @export
#' @importFrom WGCNA blockwiseModules
#' @examples
#' data(filt.se)
#' # The SFT fit was previously calculated and the optimal power was 16
#' cor <- WGCNA::cor
#' gcn <- exp2gcn_blockwise(
#' exp = filt.se, SFTpower = 18, cor_method = "pearson"
#' )
exp2gcn_blockwise <- function(
exp, net_type = "signed", module_merging_threshold = 0.8,
SFTpower = NULL, cor_method = "pearson", TOM_type = NULL,
max_block_size = 5000, min_module_size = 30, ...
) {
params <- list(
net_type = net_type,
module_merging_threshold = module_merging_threshold,
SFTpower = SFTpower,
cor_method = cor_method
)
exp <- handleSE(exp)
if(is.null(SFTpower)) { stop("Please, specify the SFT power.") }
cortype <- "pearson"
if(cor_method == "biweight") {
cortype <- "bicor"
} else if(!cor_method %in% c("pearson", "biweight")) {
stop("Argument to `cor_method` must be one of 'pearson' or 'biweight'.")
}
if(is.null(TOM_type)) { TOM_type <- guess_tom(net_type) }
# Detect modules
cor <- WGCNA::cor
bmod <- WGCNA::blockwiseModules(
datExpr = as.matrix(t(exp)),
checkMissingData = FALSE,
maxBlockSize = max_block_size,
corType = cortype,
maxPOutliers = 0.1,
power = SFTpower,
networkType = net_type,
TOMType = TOM_type,
minModuleSize = min_module_size,
mergeCutHeight = 1 - module_merging_threshold,
...
)
# Create result list
genes_modules <- data.frame(
Genes = names(bmod$colors), Modules = bmod$colors, row.names = NULL
)
result_list <- list(
MEs = bmod$MEs,
genes_and_modules = genes_modules,
params = params,
dendro_plot_objects = list(
tree = bmod$dendrograms[[1]],
Unmerged = bmod$unmergedColors,
Merged = bmod$colors
)
)
return(result_list)
}
#' Plot eigengene network
#'
#' @param gcn List object returned by \code{exp2gcn}.
#' @param palette Character indicating the name of the RColorBrewer palette
#' to use. Default: "PRGn".
#'
#' @return A base plot with the eigengene network
#' @importFrom stats hclust
#' @importFrom ComplexHeatmap pheatmap
#' @export
#' @rdname plot_eigengene_network
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' plot_eigengene_network(gcn)
plot_eigengene_network <- function(gcn, palette = "PRGn") {
# Calculate pairwise correlations between eigengenes and get hclust object
mes <- gcn$MEs[, !names(gcn$MEs) %in% "MEgrey"]
me_correlation <- cor(mes)
clusters <- hclust(as.dist(1 - me_correlation), method = "average")
# Create column metadata and row metadata
coldata <- data.frame(row.names = names(mes), Module = names(mes))
rowdata <- data.frame(row.names = names(mes), Module = names(mes))
colors <- list(
Module = setNames(gsub("ME", "", names(mes)), names(mes))
)
# Plot heatmap with distances
pal <- colorRampPalette(brewer.pal(n = 7, name = palette))(100)
hm <- ComplexHeatmap::pheatmap(
me_correlation, name = "Correlation",
color = pal,
cluster_cols = clusters,
cluster_rows = clusters,
main = "Pairwise correlations between module eigengenes",
legend_breaks = seq(-1, 1, 0.5),
breaks = seq(-1, 1, 0.02),
annotation_row = rowdata,
annotation_col = coldata,
legend = TRUE,
annotation_colors = colors
)
# Remove legend for row and column annotation
hm@top_annotation@anno_list$Module@show_legend <- FALSE
hm@left_annotation@anno_list$Module@show_legend <- FALSE
return(hm)
}
#' Plot dendrogram of genes and modules
#'
#' @param gcn List object returned by \code{exp2gcn}.
#'
#' @return A base plot with the gene dendrogram and modules.
#' @importFrom ggdendro dendro_data
#' @importFrom ggplot2 geom_segment aes theme_minimal labs scale_x_continuous
#' labs theme coord_cartesian element_blank geom_tile scale_fill_identity
#' scale_y_discrete geom_line element_rect
#' @importFrom patchwork wrap_plots
#' @export
#' @rdname plot_dendro_and_colors
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' plot_dendro_and_colors(gcn)
plot_dendro_and_colors <- function(gcn) {
# Create a dendrogram with ggplot2
## Get coordinates as a data frame
ddata <- ggdendro::dendro_data(
gcn$dendro_plot_objects$tree, type = "rectangle"
)$segments
## To make sure lines in the y-axis do not extend to 0 and get too long
ddata$yend[ddata$yend < 0.05] <- ddata$y[ddata$yend < 0.1] - 0.02
## Plot
p_dendro <- ggplot(ddata) +
geom_segment(aes(
x = .data$x, y = .data$y, xend = .data$xend, yend = .data$yend
), linewidth = 0.3) +
coord_cartesian(ylim = c(0, 1)) +
theme_minimal() +
labs(x = "", y = "Height", title = "Dendrogram of genes and modules") +
scale_x_continuous(expand = c(0, 0)) +
theme(
panel.grid = element_blank(),
axis.text.x = element_blank()
)
# Create a heatmap
## Heatmap data frame: 3 columns named `col`, `class`, and `value`
o <- gcn$dendro_plot_objects$tree$order
heatmap_df <- lapply(seq_along(gcn$dendro_plot_objects)[-1], function(x) {
modules <- gcn$dendro_plot_objects[[x]][o]
setname <- names(gcn$dendro_plot_objects)[x]
df <- data.frame(
col = factor(modules, levels = unique(modules)),
class = setname,
value = seq_along(modules)
)
return(df)
})
heatmap_df <- Reduce(rbind, heatmap_df)
## Create a data with coordinates for lines between rows of the heatmap
nrows <- length(unique(heatmap_df$class))
ngenes <- length(unique(heatmap_df$value))
line_df <- data.frame(
x = c(0, ngenes) + 0.5,
y = rep(2:nrows, each = 2) - 0.5
)
## Plot
p_hm <- ggplot(heatmap_df) +
geom_tile(aes(x = .data$value, y = .data$class, fill = .data$col)) +
scale_fill_identity() +
theme_minimal() +
labs(x = "", y = "") +
theme(
panel.background = element_rect(fill = "black"),
axis.text.x = element_blank(),
panel.grid = element_blank()
) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0.2, 0.2)) +
geom_line(data = line_df, aes(x = .data$x, y = .data$y, group = .data$y))
## Combine plots
heights <- c(3, 1)
if(length(gcn$dendro_plot_objects) > 3) { heights <- c(2, 1) }
p_combined <- wrap_plots(p_dendro, p_hm, nrow = 2, heights = heights)
return(p_combined)
}
#' Perform module stability analysis
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net List object returned by \code{exp2gcn}.
#' @param nRuns Number of times to resample. Default is 20.
#'
#' @return A base plot with the module stability results.
#' @seealso
#' \code{\link[WGCNA]{sampledBlockwiseModules}}
#' @rdname module_stability
#' @export
#' @importFrom WGCNA sampledBlockwiseModules matchLabels
#' @examples
#' data(filt.se)
#' filt <- filt.se[1:100, ] # reducing even further for testing purposes
#' # The SFT fit was previously calculated and the optimal power was 16
#' gcn <- exp2gcn(filt, SFTpower = 16, cor_method = "pearson")
#' # For simplicity, only 2 runs
#' module_stability(exp = filt, net = gcn, nRuns = 2)
module_stability <- function(exp, net, nRuns = 20) {
norm.exp <- handleSE(exp)
expr <- as.matrix(t(norm.exp))
cor_method <- net$params$cor_method
if(cor_method == "biweight") { cor_method <- "bicor" }
# Infer modules for original and resampled data n times (n = nRuns)
mods0 <- WGCNA::sampledBlockwiseModules(
nRuns = nRuns,
replace = FALSE,
datExpr = expr,
maxBlockSize = 5000,
checkSoftPower = FALSE,
corType = cor_method,
networkType = net$params$net_type,
TOMType = guess_tom(net$params$net_type),
TOMDenom = "mean",
mergeCutHeight = 1 - net$params$module_merging_threshold,
reassignThreshold = 0,
numericLabels = FALSE,
checkMissingData = FALSE,
quickCor = 0, verbose = 2
)
nGenes <- ncol(expr)
# Create a matrix of labels for the original and all resampling runs
labels <- matrix(0, nGenes, nRuns + 1)
labels[, 1] <- mods0[[1]]$mods$colors
## Relabel modules in each of the resampling runs
labs <- Reduce(cbind, lapply(2:(nRuns+1), function(x) {
labels[, x] <- WGCNA::matchLabels(mods0[[x-1]]$mods$colors, labels[, 1])
}))
labels <- cbind(labels[,1], labs)
colnames(labels) <- c("Original", paste0("Resampling ", seq_len(nRuns)))
## Convert matrix to list of vectors
label_list <- as.list(as.data.frame(labels))
# Simulate the output of `exp2gcn()` for plotting
sim_gcn <- list(
dendro_plot_objects = c(
list(tree = mods0[[1]]$mods$dendrograms[[1]]), label_list
)
)
# Plot dendrogram of genes and modules in each run
p <- plot_dendro_and_colors(sim_gcn)
return(p)
}
#' Correlate module eigengenes to trait
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param metadata A data frame containing sample names in row names and
#' sample annotation in the first column. Ignored if `exp` is
#' a `SummarizedExperiment` object, since the function will extract colData.
#' @param MEs Module eigengenes. It is the 2nd element of the result list
#' generated by the function \code{exp2gcn}.
#' @param metadata_cols A vector (either numeric or character) indicating
#' which columns should be extracted from column metadata if \strong{exp}
#' is a `SummarizedExperiment` object. The vector can contain column
#' indices (numeric) or column names (character). By default, all columns are
#' used.
#' @param cor_method Method to calculate correlation. One of 'pearson',
#' 'spearman' or 'kendall'. Default is 'spearman'.
#'
#' @return A data frame with correlation and correlation p-values for each pair
#' of ME and trait, with the following variables:
#' \describe{
#' \item{ME}{Factor, module eigengene.}
#' \item{trait}{Factor, trait name. Each trait corresponds to a variable
#' of the sample metadata (if numeric) or levels of a variable
#' (if categorical).}
#' \item{cor}{Numeric, correlation.}
#' \item{pvalue}{Numeric, correlation P-values.}
#' \item{group}{Character, name of the metadata variable.}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname module_trait_cor
#' @export
#' @importFrom WGCNA corPvalueStudent
#' @importFrom reshape2 melt
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' module_trait_cor(filt.se, MEs = gcn$MEs)
module_trait_cor <- function(exp, metadata, MEs, metadata_cols = NULL,
cor_method = "pearson") {
if(is(exp, "SummarizedExperiment")) {
metadata <- se2metadata(exp, coldata_cols = metadata_cols)$coldata
}
exp <- handleSE(exp)
metadata <- metadata[colnames(exp), , drop = FALSE]
# Get data frame of module-trait correlations and P-values for each variable
cor_df <- Reduce(rbind, lapply(seq_along(metadata), function(x) {
## Get matrices of correlations, P-values, and P-value symbols
trait <- get_model_matrix(metadata, column_idx = x)
cor_matrix <- cor(as.matrix(MEs), trait, use = "p", method = cor_method)
pvalues <- WGCNA::corPvalueStudent(cor_matrix, nSamples = ncol(exp))
## Reshape to long format and merge data frames into one
v <- c("ME", "trait")
cor_long <- reshape2::melt(cor_matrix, value.name = "cor", varnames = v)
p_long <- reshape2::melt(pvalues, value.name = "pvalue", varnames = v)
final_df <- merge(cor_long, p_long)
final_df$group <- names(metadata)[x]
return(final_df)
}))
return(cor_df)
}
#' Plot a heatmap of module-trait correlations
#'
#' @param corandp A data frame of module-trait correlations as returned
#' by \code{module_trait_cor()}.
#' @param palette Character indicating which RColorBrewer palette to use.
#' Default: 'RdYlBu'.
#' @param transpose Logical indicating whether to transpose the heatmap
#' or not.
#'
#' @details Significance levels:
#' 1 asterisk: significant at alpha = 0.05.
#' 2 asterisks: significant at alpha = 0.01.
#' 3 asterisks: significant at alpha = 0.001.
#' no asterisk: not significant.
#'
#' @return A `Heatmap` object created by \code{ComplexHeatmap::pheatmap()}.
#'
#' @export
#' @rdname plot_module_trait_cor
#' @importFrom reshape2 dcast
#' @importFrom RColorBrewer brewer.pal
#' @importFrom ComplexHeatmap pheatmap
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' corandp <- module_trait_cor(filt.se, MEs = gcn$MEs)
#' plot_module_trait_cor(corandp)
plot_module_trait_cor <- function(corandp, palette = "RdYlBu",
transpose = FALSE) {
# Get a wide matrix of correlations
cormat <- reshape2::dcast(corandp, ME ~ trait, value.var = "cor")
rownames(cormat) <- cormat$ME
cormat$ME <- NULL
cormat <- as.matrix(cormat)
cormat[is.na(cormat)] <- 0
# Create a matrix of correlations and P-value symbols to display
## Get a matrix of P-values and convert values to symbols
pmat <- reshape2::dcast(corandp, ME ~ trait, value.var = "pvalue")
rownames(pmat) <- pmat$ME
pmat$ME <- NULL
pmat <- pval2symbol(as.matrix(pmat))
pmat[is.na(pmat)] <- ""
## Get a matrices of correlations and P-values
textmat <- paste(signif(cormat, 2), pmat, sep = "")
dim(textmat) <- dim(cormat)
# Get row and column annotation to decorate the heatmap
rowdata <- data.frame(row.names = rownames(cormat), Module = rownames(cormat))
coldata <- corandp[!duplicated(corandp$trait), ]
coldata <- data.frame(row.names = coldata$trait, Trait = coldata$group)
# Create a list of named vectors with colors to use in the heatmap
cols <- list(
Module = setNames(gsub("ME", "", rownames(cormat)), rownames(cormat)),
Trait = setNames(
custom_palette(1)[seq_along(unique(corandp$group))],
unique(corandp$group)
)
)
# Plot heatmap
pal <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100)
hm <- ComplexHeatmap::pheatmap(
mat = if(transpose) t(cormat) else cormat,
name = "Correlation",
color = pal,
display_numbers = if(transpose) t(textmat) else textmat,
main = "Module-trait correlations",
legend_breaks = seq(-1, 1, 0.5),
breaks = seq(-1, 1, 0.02),
annotation_row = if(transpose) coldata else rowdata,
annotation_col = if(transpose) rowdata else coldata,
annotation_colors = cols,
row_split = if(transpose) coldata$Trait else NULL,
column_split = if(transpose) NULL else coldata$Trait
)
# Remove legend for module colors
if(transpose) {
hm@top_annotation@anno_list$Module@show_legend <- FALSE
} else {
hm@left_annotation@anno_list$Module@show_legend <- FALSE
}
return(hm)
}
#' Calculate gene significance for a given group of genes
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param metadata A data frame containing sample names in row names and
#' sample annotation in the first column. Ignored if `exp` is
#' a `SummarizedExperiment` object, since the function will extract colData.
#' @param metadata_cols A vector (either numeric or character) indicating
#' which columns should be extracted from column metadata if \strong{exp}
#' is a `SummarizedExperiment` object. The vector can contain column
#' indices (numeric) or column names (character). By default, all columns are
#' used.
#' @param genes Character vector of genes to be correlated with traits.
#' If not given, all genes in `exp` will be considered.
#' @param alpha Significance level. Default is 0.05.
#' @param cor_method Method to calculate correlation. One of 'pearson',
#' 'spearman' or 'kendall'. Default is 'spearman'.
#' @param min_cor Minimum correlation coefficient. Default is 0.2.
#' @param use_abs Logical indicating whether to filter by correlation using
#' absolute value or not. If TRUE, a \code{min_cor} of say 0.2 would keep all
#' correlations above 0.2 and below -0.2. Default is TRUE.
#'
#' @return A data frame with correlation and correlation p-values for each pair
#' of gene and trait, with the following variables:
#' \describe{
#' \item{gene}{Factor, gene ID.}
#' \item{trait}{Factor, trait name. Each trait corresponds to a variable
#' of the sample metadata (if numeric) or levels of a variable
#' (if categorical).}
#' \item{cor}{Numeric, correlation.}
#' \item{pvalue}{Numeric, correlation P-values.}
#' \item{group}{Character, name of the metadata variable.}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname gene_significance
#' @export
#' @importFrom WGCNA corPvalueStudent
#' @importFrom reshape2 melt
#' @examples
#' data(filt.se)
#' gs <- gene_significance(filt.se)
gene_significance <- function(
exp, metadata, metadata_cols = NULL, genes = NULL, alpha = 0.05,
cor_method = "pearson", min_cor = 0.2, use_abs = TRUE
) {
if(is(exp, "SummarizedExperiment")) {
metadata <- se2metadata(exp, coldata_cols = metadata_cols)$coldata
}
# Make sure samples in expression matrix and sample metadata match
exp <- handleSE(exp)
metadata <- metadata[colnames(exp), , drop = FALSE]
final_exp <- exp[, rownames(metadata)]
# Use only a subset of genes?
if(!is.null(genes)) {
final_exp <- final_exp[genes, , drop = FALSE]
}
# Get data frame of gene-trait correlations and P-values for each variable
cor_df <- Reduce(rbind, lapply(seq_along(metadata), function(x) {
## Get matrices of correlations, P-values, and P-value symbols
trait <- get_model_matrix(metadata, column_idx = x)
cor_matrix <- cor(
as.matrix(t(exp)), trait, use = "p", method = cor_method
)
pvalues <- WGCNA::corPvalueStudent(cor_matrix, nSamples = ncol(exp))
## Reshape to long format and merge data frames into one
v <- c("gene", "trait")
cor_long <- reshape2::melt(cor_matrix, value.name = "cor", varnames = v)
p_long <- reshape2::melt(pvalues, value.name = "pvalue", varnames = v)
final_df <- merge(cor_long, p_long)
final_df$group <- names(metadata)[x]
return(final_df)
}))
# Filter by correlation and P-value
if(use_abs) {
cor_df <- cor_df[cor_df$pvalue < alpha & abs(cor_df$cor) >= min_cor, ]
} else {
cor_df <- cor_df[cor_df$pvalue < alpha & cor_df$cor >= min_cor, ]
}
return(cor_df)
}
#' Plot a heatmap of gene significance
#'
#' @param corandp A data frame of gene-trait correlations as returned
#' by \code{gene_significance()}.
#' @param palette Character indicating which RColorBrewer palette to use.
#' Default: 'RdYlBu'.
#' @param transpose Logical indicating whether to transpose the heatmap
#' or not.
#' @param ... Additional arguments to \code{ComplexHeatmap::pheatmap()}.
#'
#' @details Significance levels:
#' 1 asterisk: significant at alpha = 0.05.
#' 2 asterisks: significant at alpha = 0.01.
#' 3 asterisks: significant at alpha = 0.001.
#' no asterisk: not significant.
#'
#' @return A `Heatmap` object created by \code{ComplexHeatmap::pheatmap()}.
#'
#' @export
#' @rdname plot_gene_significance
#' @importFrom reshape2 dcast
#' @importFrom RColorBrewer brewer.pal
#' @importFrom ComplexHeatmap pheatmap
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' corandp <- gene_significance(filt.se)
#' plot_gene_significance(corandp, show_rownames = FALSE)
plot_gene_significance <- function(
corandp, palette = "RdYlBu", transpose = FALSE, ...
) {
# Get a wide matrix of correlations
cormat <- reshape2::dcast(corandp, gene ~ trait, value.var = "cor")
rownames(cormat) <- cormat$gene
cormat$gene <- NULL
cormat <- as.matrix(cormat)
cormat[is.na(cormat)] <- 0
# Create a matrix of correlations and P-value symbols to display
## Get a matrix of P-values and convert values to symbols
pmat <- reshape2::dcast(corandp, gene ~ trait, value.var = "pvalue")
rownames(pmat) <- pmat$gene
pmat$gene <- NULL
pmat <- pval2symbol(as.matrix(pmat))
pmat[is.na(pmat)] <- ""
## Get a matrices of correlations and P-values
textmat <- paste(signif(cormat, 2), pmat, sep = "")
dim(textmat) <- dim(cormat)
# Get column annotation to decorate the heatmap
coldata <- corandp[!duplicated(corandp$trait), ]
coldata <- data.frame(row.names = coldata$trait, Trait = coldata$group)
coldata <- coldata[colnames(cormat), , drop = FALSE]
# Create a list of named vectors with colors to use in the heatmap
cols <- list(
Trait = setNames(
custom_palette(1)[seq_along(unique(corandp$group))],
unique(corandp$group)
)
)
# Plot heatmap
pal <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100)
hm <- ComplexHeatmap::pheatmap(
mat = if(transpose) t(cormat) else cormat,
name = "Correlation",
color = pal,
display_numbers = FALSE,
main = "Gene-trait correlations",
legend_breaks = seq(-1, 1, 0.5),
breaks = seq(-1, 1, 0.02),
annotation_row = if(transpose) coldata else NA,
annotation_col = if(transpose) NA else coldata,
annotation_colors = cols,
row_split = if(transpose) coldata$Trait else NULL,
column_split = if(transpose) NULL else coldata$Trait,
...
)
return(hm)
}
#' Get GCN hubs
#'
#' @param exp A gene expression data frame with genes in row names and
#' samples in column names or a `SummarizedExperiment` object.
#' @param net List object returned by \code{exp2gcn}.
#'
#' @return Data frame containing gene IDs, modules and
#' intramodular connectivity of all hubs.
#' @author Fabricio Almeida-Silva
#' @seealso
#' \code{\link[WGCNA]{signedKME}}
#' @rdname get_hubs_gcn
#' @export
#' @importFrom WGCNA signedKME
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' hubs <- get_hubs_gcn(filt.se, gcn)
get_hubs_gcn <- function(exp, net) {
exp <- handleSE(exp)
cor_method <- net$params$cor_method
genes_modules <- net$genes_and_modules
MEs <- net$MEs
kIN <- net$kIN[, 2, drop=FALSE]
# Add kIN info
genes_modulesk <- merge(
genes_modules, kIN, by.x = "Genes", by.y = "row.names"
)
# Calculate kME
if(cor_method == "spearman") {
MM <- WGCNA::signedKME(
t(exp), MEs, outputColumnName = "",
corOptions = "use = 'p', method = 'spearman'"
)
} else if(cor_method == "pearson") {
MM <- WGCNA::signedKME(t(exp), MEs, outputColumnName = "")
} else if(cor_method == "biweight") {
MM <- WGCNA::signedKME(
t(exp), MEs, corFnc = "bicor", outputColumnName = "",
corOptions = "maxPOutliers = 0.1"
)
}
# Add kME info
kme_info <- merge(genes_modulesk, MM, by.x="Genes", by.y="row.names")
# Keep top 10% genes with the highest degree for each module
genes_mod_list <- split(kme_info, kme_info[,2])
genes_mod_list$grey <- NULL
top_10 <- lapply(genes_mod_list, function(x) {
indices <- round(nrow(x) * 0.1)
return(x[order(x$kWithin, decreasing=TRUE), ][seq_len(indices), ])
})
# Pick genes from the top 10% degree with kME above 0.8
hubs <- Reduce(rbind, lapply(top_10, function(x) {
cols <- c("Genes", "Modules", "kWithin", unique(x$Modules))
h <- x[, cols]
h <- h[h[,4] > 0.8, c(1,2,3)]
}))
rownames(hubs) <- seq_len(nrow(hubs))
colnames(hubs) <- c("Gene", "Module", "kWithin")
return(hubs)
}
#' Perform overrepresentation analysis for a set of genes
#'
#' @param genes Character vector containing genes for which
#' overrepresentation will be tested.
#' @param genesets Named list of character vectors, where list names
#' represent functional categories (e.g., GO, pathway, etc.), and vectors
#' represent their associated genes.
#' @param universe Character vector of genes to be used as universe.
#' @param adj Character indicating the multiple testing correction method
#' as in \code{stats::p.adjust()}.
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#' \item{term}{character, functional term ID/name.}
#' \item{genes}{numeric, intersection length between input genes and genes
#' in a particular functional term.}
#' \item{all}{numeric, number of all genes in a particular functional term.}
#' \item{pval}{numeric, P-value for the hypergeometric test.}
#' \item{padj}{numeric, P-value adjusted for multiple comparisons using
#' the method specified in parameter \strong{adj}.}
#' }
#'
#' @importFrom stats p.adjust phyper
#' @noRd
ora <- function(
genes, genesets, universe, adj = "BH"
) {
# Remove genes that are not in `universe`
genes <- intersect(genes, universe)
gene_sets <- lapply(genesets, function(x) intersect(x, universe))
# Define universe and input genes
n_universe <- length(universe)
n_genes <- length(genes)
# Define variables of the hypergeometric test
x <- vapply(gene_sets, function(x) length(intersect(x, genes)), numeric(1))
m <- vapply(gene_sets, length, numeric(1))
n <- n_universe - m
k <- n_genes
# Get P-values from a hypergeometric test
pvals <- phyper(x - 1, m, n, k, lower.tail = FALSE)
# Store results in a data frame
results <- data.frame(
term = names(pvals),
genes = x,
all = m,
pval = pvals,
row.names = NULL
)
results$padj <- p.adjust(results$pval, method = adj)
return(results)
}
#' Perform overrepresentation analysis for a set of genes
#'
#' @param genes Character vector containing genes for overrepresentation
#' analysis.
#' @param background_genes Character vector of genes to be used as background
#' for the overrepresentation analysis.
#' @param annotation Annotation data frame with genes in the first column and
#' functional annotation in the other columns. This data frame can be exported
#' from Biomart or similar databases.
#' @param column Column or columns of \strong{annotation} to be used for
#' enrichment.
#' Both character or numeric values with column indices can be used.
#' If users want to supply more than one column, input a character or
#' numeric vector. Default: all columns from \strong{annotation}.
#' @param correction Multiple testing correction method. One of "holm",
#' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none".
#' Default is "BH".
#' @param p P-value threshold. P-values below this threshold will be
#' considered significant. Default: 0.05.
#' @param min_setsize Numeric indicating the minimum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 10.
#' @param max_setsize Numeric indicating the maximum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 500.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#' \item{term}{character, functional term ID/name.}
#' \item{genes}{numeric, intersection length between input genes and genes
#' in a particular functional term.}
#' \item{all}{numeric, number of all genes in a particular functional term.}
#' \item{pval}{numeric, P-value for the hypergeometric test.}
#' \item{padj}{numeric, P-value adjusted for multiple comparisons using
#' the method specified in parameter \strong{adj}.}
#' \item{category}{character, name of the grouping variable (i.e., column
#' name of \strong{annotation}).}
#' }
#'
#' @author Fabricio Almeida-Silva
#' @rdname enrichment_analysis
#' @export
#' @importFrom BiocParallel bplapply
#' @examples
#' \donttest{
#' data(filt.se)
#' data(zma.interpro)
#' genes <- rownames(filt.se)[1:50]
#' background_genes <- rownames(filt.se)
#' annotation <- zma.interpro
#' # Using p = 1 to show all results
#' enrich <- enrichment_analysis(genes, background_genes, annotation, p = 1)
#' }
enrichment_analysis <- function(
genes, background_genes, annotation, column = NULL,
correction = "BH", p = 0.05, min_setsize = 10, max_setsize = 500,
bp_param = BiocParallel::SerialParam()
) {
names(annotation)[1] <- "Gene"
# Filtered `annotation` data frame to keep only specified columns
col <- names(annotation)
if(!is.null(column)) {
col <- if(is.numeric(column)) c(1, column) else c("Gene", column)
}
fannot <- annotation[, col]
# Get a data frame of enrichment results for each column
enrichment <- bplapply(seq_along(fannot)[-1], function(x) {
## Remove missing values (unannotated genes for a particular category)
fannot[, x][is.na(fannot[, x])] <- "unannotated"
## Split variable into a named list
gene_sets <- split(fannot[, 1], fannot[, x])
gene_sets <- gene_sets[!duplicated(gene_sets)]
gene_sets <- gene_sets[names(gene_sets) != "unannotated"]
## Keep only element with `min_setsize` <= n <= `max_setsize` elements
set_sizes <- as.numeric(lengths(gene_sets))
keep <- which(set_sizes >= min_setsize & set_sizes <= max_setsize)
gene_sets <- gene_sets[keep]
## Get ORA results and add `category` column
ora_df <- ora(genes, gene_sets, background_genes, correction)
if(nrow(ora_df) > 0) {
ora_df$category <- names(fannot)[x]
## Filter non-significant observations out
ora_df <- ora_df[ora_df$padj <= p, ]
}
}, BPPARAM = bp_param)
enrichment <- Reduce(rbind, enrichment)
return(enrichment)
}
#' Perform enrichment analysis for coexpression network modules
#'
#' @param net List object returned by \code{exp2gcn}.
#' @param background_genes Character vector of genes to be used as background
#' for the Fisher's Exact Test.
#' @param annotation Annotation data frame with genes in the first column
#' and functional annotation in the other columns. This data frame can
#' be exported from Biomart or similar databases.
#' @param column Column or columns of \code{annotation} to be used for enrichment.
#' Both character or numeric values with column indices can be used.
#' If users want to supply more than one column, input a character or
#' numeric vector. Default: all columns from \code{annotation}.
#' @param correction Multiple testing correction method. One of "holm",
#' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none".
#' Default is "BH".
#' @param p P-value threshold. P-values below this threshold will be considered
#' significant. Default is 0.05.
#' @param min_setsize Numeric indicating the minimum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 10.
#' @param max_setsize Numeric indicating the maximum gene set size to be
#' considered. Gene sets correspond to levels of each variable
#' in \strong{annotation}). Default: 500.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return A data frame of overrepresentation results with the following
#' variables:
#' \describe{
#' \item{term}{character, functional term ID/name.}
#' \item{genes}{numeric, intersection length between input genes and genes
#' in a particular functional term.}
#' \item{all}{numeric, number of all genes in a particular functional term.}
#' \item{pval}{numeric, P-value for the hypergeometric test.}
#' \item{padj}{numeric, P-value adjusted for multiple comparisons using
#' the method specified in parameter \strong{adj}.}
#' \item{category}{character, name of the grouping variable (i.e., column
#' name of \strong{annotation}).}
#' \item{module}{character, module name.}
#' }
#'
#' @author Fabricio Almeida-Silva
#'
#' @rdname module_enrichment
#' @importFrom BiocParallel bplapply SerialParam
#' @export
#' @examples
#' \donttest{
#' data(filt.se)
#' data(zma.interpro)
#' background <- rownames(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' mod_enrich <- module_enrichment(gcn, background, zma.interpro, p=1)
#' }
module_enrichment <- function(
net = NULL, background_genes, annotation, column = NULL,
correction = "BH", p = 0.05, min_setsize = 10, max_setsize = 500,
bp_param = BiocParallel::SerialParam()
) {
# Create a list of data frames with columns `Genes` and `Modules`
genes.modules <- net$genes_and_modules
list.gmodules <- split(genes.modules, genes.modules$Modules)
list.gmodules <- list.gmodules[names(list.gmodules) != "grey"]
enrichment_all <- bplapply(seq_along(list.gmodules), function(x) {
module <- names(list.gmodules)[x]
message("Enrichment analysis for module ", module, "...")
# Perform enrichment analysis for {module}
l <- enrichment_analysis(
genes = as.character(list.gmodules[[x]][, 1]),
background_genes = background_genes,
annotation = annotation,
column = column,
correction = correction, p = p,
min_setsize = min_setsize, max_setsize = max_setsize
)
# Either add a column `module` or set `l` to NULL
if(!is.null(l)) {
if(nrow(l) > 0) {
l$module <- module
} else {
l <- NULL
}
}
return(l)
}, BPPARAM = bp_param)
enrichment_all <- Reduce(rbind, enrichment_all)
return(enrichment_all)
}
#' Get 1st-order neighbors of a given gene or group of genes
#'
#' @param genes Character vector containing genes from which
#' direct neighbors will be extracted.
#' @param net List object returned by \code{exp2gcn}.
#' @param cor_threshold Correlation threshold to filter connections.
#' As a weighted network is a fully connected graph, a cutoff must be selected.
#' Default is 0.7.
#'
#' @return List containing 1st-order neighbors for each input gene.
#' @seealso \code{exp2gcn} \code{SFT_fit}
#' @author Fabricio Almeida-Silva
#' @export
#' @rdname get_neighbors
#' @examples
#' data(filt.se)
#' genes <- rownames(filt.se)[1:10]
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' neighbors <- get_neighbors(genes, gcn)
get_neighbors <- function(genes, net, cor_threshold = 0.7) {
net_type <- net$params$net_type
power <- net$params$SFTpower
edges <- net$correlation_matrix
edges <- cormat_to_edgelist(edges)
colnames(edges) <- c("Gene1", "Gene2", "Weight")
filt_edges <- edges[edges$Gene1 %in% genes | edges$Gene2 %in% genes, ]
if(net_type == "signed") {
filt_edges <- filt_edges[abs(filt_edges$Weight) >= cor_threshold, ]
} else {
filt_edges <- filt_edges[filt_edges$Weight >= cor_threshold, ]
}
result_list <- lapply(genes, function(x) {
y <- filt_edges[rowSums(filt_edges == x) > 0, ]
y <- c(as.character(y$Gene1), as.character(y$Gene2))
y <- unique(y[y != x])
return(y)
})
names(result_list) <- genes
return(result_list)
}
#' Get edge list from an adjacency matrix for a group of genes
#'
#' @param net List object returned by \code{exp2gcn}.
#' @param genes Character vector containing a subset of genes from which
#' edges will be extracted. It can be ignored if the user wants to extract
#' an edge list for a given module instead of individual genes.
#' @param module Character with module name from which edges will be extracted.
#' To include 2 or more modules, input the names in a character vector.
#' @param filter Logical indicating whether to filter the edge list or not.
#' @param method Method to filter spurious correlations. One of "Zscore",
#' "optimalSFT", "pvalue" or "min_cor". See details for more information
#' on the methods. Default: 'optimalSFT'
#' @param r_optimal_test Numeric vector with the correlation thresholds
#' to be tested for optimal scale-free topology fit. Only valid
#' if \code{method} equals "optimalSFT". Default: seq(0.4, 0.9, by = 0.1)
#' @param Zcutoff Minimum Z-score threshold. Only valid if
#' \code{method} equals "Zscore". Default: 1.96
#' @param pvalue_cutoff Maximum P-value threshold. Only valid
#' if \code{method} equals "pvalue". Default: 0.05
#' @param rcutoff Minimum correlation threshold. Only valid
#' if \code{method} equals "min_cor". Default: 0.7
#' @param nSamples Number of samples in the data set from which
#' the correlation matrix was calculated. Only required
#' if \code{method} equals "pvalue".
#' @param check_SFT Logical indicating whether to test if the resulting network
#' is close to a scale-free topology or not. Default: FALSE.
#' @param bp_param BiocParallel back-end to be used.
#' Default: BiocParallel::SerialParam()
#'
#' @return Data frame with edge list for the input genes.
#' @details The default method ("optimalSFT") will create several different
#' edge lists by filtering the original correlation matrix by the thresholds
#' specified in \code{r_optimal_test}. Then, it will calculate a scale-free
#' topology fit index for each of the possible networks and return the network
#' that best fits the scale-free topology.
#' The method "Zscore" will apply a Fisher Z-transformation for the correlation
#' coefficients and remove the Z-scores below the threshold specified
#' in \code{Zcutoff}.
#' The method "pvalue" will calculate Student asymptotic p-value for the
#' correlations and remove correlations whose p-values are above the threshold
#' specified in \code{pvalue_cutoff}.
#' The method "min_cor" will remove correlations below the minimum correlation
#' threshold specified in \code{rcutoff}.
#' @seealso
#' \code{\link[WGCNA]{scaleFreeFitIndex}}
#' @seealso \code{SFT_fit}
#' @seealso \code{exp2gcn}.
#' @author Fabricio Almeida-Silva
#' @rdname get_edge_list
#' @export
#' @importFrom WGCNA scaleFreeFitIndex corPvalueStudent
#' @importFrom ggplot2 theme geom_point geom_line theme_bw
#' @importFrom BiocParallel bplapply SerialParam
#' @examples
#' data(filt.se)
#' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson")
#' genes <- rownames(filt.se)[1:50]
#' edges <- get_edge_list(gcn, genes=genes, filter = FALSE)
get_edge_list <- function(
net, genes = NULL, module = NULL, filter = FALSE,
method = "optimalSFT", r_optimal_test = seq(0.4, 0.9, by = 0.1),
Zcutoff = 1.96, pvalue_cutoff = 0.05, rcutoff = 0.7, nSamples = NULL,
check_SFT = FALSE, bp_param = BiocParallel::SerialParam()
) {
# Define objects containing correlation matrix and data frame of genes and modules
cor_matrix <- net$correlation_matrix
# Should we extract genes in a module?
if(!is.null(module)) {
genes_modules <- net$genes_and_modules
keep <- genes_modules[genes_modules$Modules %in% module, 1]
cor_matrix <- cor_matrix[keep, keep]
}
# Should we extract a user-defined gene set?
if(!is.null(genes)) {
cor_matrix <- cor_matrix[genes, genes]
}
# Should we filter the matrix?
if(filter) {
# Create edge list from correlation matrix
edges <- cormat_to_edgelist(cor_matrix)
colnames(edges) <- c("Gene1", "Gene2", "Weight")
if(method == "Zscore") {
# Apply Fisher-Z transformation to correlation values
edgesZ <- edges
edgesZ$Weight <- 0.5 * log((1+edges$Weight) / (1-edges$Weight))
edgelist <- edgesZ[edgesZ$Weight >= Zcutoff, ]
} else if(method == "optimalSFT") {
cutoff <- r_optimal_test
# Create list of 2 elements: edge lists, each with a different correlation threshold, and degree
list_mat <- replicate(length(cutoff), cor_matrix, simplify = FALSE)
list_cormat_filtered <- BiocParallel::bplapply(seq_along(list_mat), function(x) {
# Convert r values below threshold to NA and set diagonals to 0
matrix <- list_mat[[x]]
matrix[matrix < cutoff[x] ] <- NA
diag(matrix) <- 0
# Calculate degree
degree <- rowSums(matrix, na.rm=TRUE)
# Convert lower triangle and diagonals to NA
matrix[lower.tri(matrix, diag=TRUE)] <- NA
# Convert symmetrical matrix to edge list (Gene1, Gene2, Weight)
matrix <- na.omit(data.frame(as.table(matrix)))
result <- list(matrix = matrix, degree = degree)
return(result)
}, BPPARAM = bp_param)
degree_list <- lapply(list_cormat_filtered, function(x) return(x[[2]]))
# Calculate scale-free topology
sft.rsquared <- unlist(lapply(degree_list, function(x) WGCNA::scaleFreeFitIndex(x)$Rsquared.SFT))
max.index <- which.max(sft.rsquared)
# Plot scale-free topology fit for r values
plot.data <- data.frame(x = cutoff, y = sft.rsquared)
plot <- ggplot(plot.data, aes(x = .data$x, y = .data$y, group = 1)) +
geom_point(color = "firebrick", size = 4) +
geom_line(color = "firebrick", linewidth = 2) +
labs(
x = "Correlation (r) values",
y = expression(paste("Scale-free topology fit - ", R^{2})),
title = "Scale-free topology fit for given r values"
) +
theme_bw()
plot
optimalr <- cutoff[max.index]
message("The correlation threshold that best fits the scale-free topology is ", optimalr)
edgelist <- list_cormat_filtered[[max.index]][[1]]
} else if(method == "pvalue") {
if(is.null(nSamples)) {
stop("Please, specify the number of samples used to calculate the correlation matrix")
}
# Calculate Student asymptotic p-value for given correlations
cor.pvalue <- WGCNA::corPvalueStudent(edges$Weight, nSamples)
# Create a data frame of correlations and p-values
corandp <- edges
corandp$pvalue <- cor.pvalue
# Create a final edge list containing only significant correlations
edgelist <- corandp[corandp$pvalue < pvalue_cutoff, c(1,2,3)]
} else if(method == "min_cor") {
edgelist <- edges[edges$Weight >= rcutoff, ]
} else{
stop("Please, specify a valid filtering method.")
}
} else {
# Create edge list from correlation matrix without filtering
edgelist <- cormat_to_edgelist(cor_matrix)
colnames(edgelist) <- c("Gene1", "Gene2", "Weight")
}
# Check scale-free topology fit
if(check_SFT) {
test <- check_SFT(edgelist, net_type="gcn")
}
return(edgelist)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.