#' Plot values for a protein of interest
#'
#' \code{plot_single} generates a barplot of a protein of interest.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param proteins Character,
#' The name(s) of the protein(s) to plot.
#' @param type 'contrast' or 'centered',
#' The type of data scaling used for plotting.
#' Either the fold change ('contrast') or
#' the centered log2-intensity ('centered').
#' @param plot Logical(1),
#' If \code{TRUE} (default) the barplot is produced.
#' Otherwise (if \code{FALSE}), the summaries which the
#' barplot is based on are returned.
#' @return A barplot (generated by \code{\link[ggplot2]{ggplot}}).
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot single proteins
#' plot_single(dep, 'USP15')
#' plot_single(dep, 'USP15', 'centered')
#' plot_single(dep, c('USP15', 'CUL1'))
#' plot_single(dep, c('USP15', 'CUL1'), plot = FALSE)
#' @export
plot_single <- function(dep, proteins, type = c("contrast", "centered"), plot = TRUE) {
# Show error if inputs are not the required classes
assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
is.character(proteins),
is.character(type),
is.logical(plot),
length(plot) == 1)
# Show error if inputs do not contain required columns
type <- match.arg(type)
row_data <- rowData(dep, use.names = FALSE)
if(any(!c("label", "condition", "replicate") %in% colnames(colData(dep)))) {
stop("'label', 'condition' and/or 'replicate' columns are not present in '",
deparse(substitute(dep)),
"'\nRun make_se() or make_se_parse() to obtain the required columns",
call. = FALSE)
}
if(length(grep("_p.adj|_diff", colnames(row_data))) < 1) {
stop("'[contrast]_diff' and '[contrast]_p.adj' columns are not present in '",
deparse(substitute(dep)),
"'\nRun test_diff() to obtain the required columns",
call. = FALSE)
}
if(!"name" %in% colnames(row_data)) {
stop("'name' column not present in '",
deparse(substitute(dep)),
"'\nRun make_se() or make_se_parse() to obtain the required columns",
call. = FALSE)
}
# Show error if an unvalid protein name is given
if(all(!proteins %in% row_data$name)) {
if(length(proteins) == 1) {
rows <- grep(substr(proteins, 1, nchar(proteins) - 1),row_data$name)
possibilities <- row_data$name[rows]
} else {
rows <- lapply(proteins, function(x)
grep(substr(x, 1, nchar(x) - 1),row_data$name))
possibilities <- row_data$name[unlist(rows)]
}
if(length(possibilities) > 0) {
possibilities_msg <- paste0(
"Do you mean: '",
paste0(possibilities, collapse = "', '"),
"'")
} else {
possibilities_msg <- NULL
}
stop("please run `plot_single()` with a valid protein names in the 'proteins' argument\n",
possibilities_msg,
call. = FALSE)
}
if(any(!proteins %in% row_data$name)) {
proteins <- proteins[proteins %in% row_data$name]
warning("Only used the following protein(s): '",
paste0(proteins, collapse = "', '"),
"'")
}
# Single protein
subset <- dep[proteins]
# Plot either the centered log-intensity values
# per condition ('centered') or the average fold change of conditions
# versus the control condition ('contrast') for a single protein
if(type == "centered") {
# Obtain protein-centered fold change values
means <- rowMeans(assay(subset), na.rm = TRUE)
df_reps <- data.frame(assay(subset) - means) %>%
rownames_to_column() %>%
gather(ID, val, -rowname) %>%
left_join(., data.frame(colData(subset)), by = "ID")
df_reps$replicate <- as.factor(df_reps$replicate)
df <- df_reps %>%
group_by(condition, rowname) %>%
summarize(mean = mean(val, na.rm = TRUE),
sd = sd(val, na.rm = TRUE),
n = n()) %>%
mutate(error = qnorm(0.975) * sd / sqrt(n),
CI.L = mean - error,
CI.R = mean + error) %>%
as.data.frame()
df$rowname <- parse_factor(df$rowname, levels = proteins)
# Plot the centered intensity values for the replicates and the mean
p <- ggplot(df, aes(condition, mean)) +
geom_hline(yintercept = 0) +
geom_col(colour = "black", fill = "grey") +
geom_point(data = df_reps, aes(condition, val, col = replicate),
shape = 18, size = 5, position = position_dodge(width=0.3)) +
geom_errorbar(aes(ymin = CI.L, ymax = CI.R), width = 0.3) +
labs(x = "Baits",
y = expression(log[2]~"Centered intensity"~"(\u00B195% CI)"),
col = "Rep") +
facet_wrap(~rowname) +
theme_DEP2()
}
if(type == "contrast") {
# Select values for a single protein
df <- rowData(subset, use.names = FALSE) %>%
data.frame() %>%
select(name,
ends_with("_diff"),
ends_with("_CI.L"),
ends_with("_CI.R")) %>%
gather(var, val, -name) %>%
mutate(contrast = gsub("_diff|_CI.L|_CI.R", "", var),
var = gsub(".*_", "", var)) %>%
spread(var, val)
df$name <- parse_factor(df$name, levels = proteins)
suffix <- get_suffix(df$contrast)
if(length(suffix)) {df$contrast <- delete_suffix(df$contrast)}
# Plot the average fold change of conditions versus the control condition
p <- ggplot(df, aes(contrast, diff)) +
geom_hline(yintercept = 0) +
geom_col(colour = "black", fill = "grey") +
geom_errorbar(aes(ymin = CI.L, ymax = CI.R), width = 0.3) +
labs(x = suffix,
y = expression(log[2]~"Fold change"~"(\u00B195% CI)")) +
facet_wrap(~name) +
theme_DEP2()
}
if(plot) {
return(p)
} else {
if(type == "centered") {
df <- df %>%
select(rowname, condition, mean, CI.L, CI.R)
colnames(df) <- c("protein", "condition",
"log2_intensity", "CI.L", "CI.R")
}
if(type == "contrast") {
df <- df %>%
select(name, contrast, diff, CI.L, CI.R) %>%
mutate(contrast = paste0(contrast, suffix))
colnames(df) <- c("protein", "contrast",
"log2_fold_change", "CI.L", "CI.R")
}
return(df)
}
}
# Internal function to get ComplexHeatmap::HeatmapAnnotation object
get_annotation <- function(dep, indicate) {
assertthat::assert_that(
inherits(dep, "SummarizedExperiment"),
is.character(indicate))
# Check indicate columns
col_data <- colData(dep) %>%
as.data.frame()
columns <- colnames(col_data)
if(all(!indicate %in% columns)) {
stop("'",
paste0(indicate, collapse = "' and/or '"),
"' column(s) is/are not present in ",
deparse(substitute(dep)),
".\nValid columns are: '",
paste(columns, collapse = "', '"),
"'.",
call. = FALSE)
}
if(any(!indicate %in% columns)) {
indicate <- indicate[indicate %in% columns]
warning("Only used the following indicate column(s): '",
paste0(indicate, collapse = "', '"),
"'")
}
# Get annotation
anno <- select(col_data, indicate)
# Annotation color
names <- colnames(anno)
anno_col <- vector(mode="list", length=length(names))
names(anno_col) <- names
for(i in names) {
var = anno[[i]] %>% unique() %>% sort()
if(length(var) == 1)
cols <- c("black")
if(length(var) == 2)
cols <- c("orangered", "cornflowerblue")
if(length(var) < 7 & length(var) > 2)
cols <- RColorBrewer::brewer.pal(length(var), "Pastel1")
if(length(var) > 7)
cols <- RColorBrewer::brewer.pal(length(var), "Set3")
names(cols) <- var
anno_col[[i]] <- cols
}
# HeatmapAnnotation object
HeatmapAnnotation(df = anno,
col = anno_col,
show_annotation_name = TRUE)
}
#' Plot a heatmap
#'
#' \code{plot_heatmap} generates a heatmap of all significant proteins.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param type 'contrast' or 'centered',
#' The type of data scaling used for plotting.
#' Either the fold change ('contrast') or
#' the centered log2-intensity ('centered').
#' @param kmeans Logical(1),
#' Whether or not to perform k-means clustering.
#' @param k Integer(1),
#' Sets the number of k-means clusters.
#' @param col_limit Integer(1),
#' Sets the outer limits of the color scale.
#' @param indicate Character,
#' Sets additional annotation on the top of the heatmap
#' based on columns from the experimental design (colData).
#' Only applicable to type = 'centered'.
#' @param clustering_distance "euclidean", "maximum", "manhattan", "canberra",
#' "binary", "minkowski", "pearson", "spearman", "kendall" or "gower",
#' Function used to calculate clustering distance (for proteins and samples).
#' Based on \code{\link[ComplexHeatmap]{Heatmap}}
#' and \code{\link[cluster]{daisy}}.
#' @param row_font_size Integer(1),
#' Sets the size of row labels.
#' @param col_font_size Integer(1),
#' Sets the size of column labels.
#' @param plot Logical(1),
#' If \code{TRUE} (default) the heatmap is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' heatmap is based on are returned.
#' @param ... Additional arguments for Heatmap function as depicted in
#' \code{\link[ComplexHeatmap]{Heatmap}}
#' @return A heatmap (generated by \code{\link[ComplexHeatmap]{Heatmap}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot heatmap
#' plot_heatmap(dep)
#' plot_heatmap(dep, 'centered', kmeans = TRUE, k = 6, row_font_size = 3)
#' plot_heatmap(dep, 'contrast', col_limit = 10, row_font_size = 3)
#' @export
plot_heatmap <- function(dep, type = c("contrast", "centered"),
kmeans = FALSE, k = 6,
col_limit = 6, indicate = NULL,
clustering_distance = c("euclidean", "maximum", "manhattan", "canberra",
"binary", "minkowski", "pearson", "spearman", "kendall", "gower"),
row_font_size = 6, col_font_size = 10, plot = TRUE, ...) {
# Show error if inputs are not the required classes
if(is.integer(k)) k <- as.numeric(k)
if(is.integer(col_limit)) col_limit <- as.numeric(col_limit)
if(is.integer(row_font_size)) row_font_size <- as.numeric(row_font_size)
if(is.integer(col_font_size)) col_font_size <- as.numeric(col_font_size)
assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
is.character(type),
is.logical(kmeans),
is.numeric(k),
length(k) == 1,
is.numeric(col_limit),
length(col_limit) == 1,
is.numeric(row_font_size),
length(row_font_size) == 1,
is.numeric(col_font_size),
length(col_font_size) == 1,
is.logical(plot),
length(plot) == 1)
# Show error if inputs do not contain required columns
type <- match.arg(type)
clustering_distance <- match.arg(clustering_distance)
# Extract row and col data
row_data <- rowData(dep, use.names = FALSE)
col_data <- colData(dep) %>%
as.data.frame()
# Show error if inputs do not contain required columns
if(any(!c("label", "condition", "replicate") %in% colnames(col_data))) {
stop(paste0("'label', 'condition' and/or 'replicate' columns are not present in '",
deparse(substitute(dep)), "'"),
call. = FALSE)
}
if(length(grep("_diff", colnames(row_data))) < 1) {
stop(paste0("'[contrast]_diff' columns are not present in '",
deparse(substitute(dep)),
"'.\nRun test_diff() to obtain the required columns."),
call. = FALSE)
}
if(!"significant" %in% colnames(row_data)) {
stop(paste0("'significant' column is not present in '",
deparse(substitute(dep)),
"'.\nRun add_rejections() to obtain the required column."),
call. = FALSE)
}
# Heatmap annotation
if(!is.null(indicate) & type == "contrast") {
warning("Heatmap annotation only applicable for type = 'centered'",
call. = FALSE)
}
if(!is.null(indicate) & type == "centered") {
ha1 <- get_annotation(dep, indicate)
} else {
ha1 <- NULL
}
# Filter for significant proteins only
filtered <- dep[row_data$significant, ]
# Check for missing values
if(any(is.na(assay(filtered)))) {
warning("Missing values in '", deparse(substitute(dep)), "'. ",
"Using clustering_distance = 'gower'",
call. = FALSE)
clustering_distance <- "gower"
obs_NA <- TRUE
} else {
obs_NA <- FALSE
}
# Get centered intensity values ('centered')
if(type == "centered") {
rowData(filtered)$mean <- rowMeans(assay(filtered), na.rm = TRUE)
df <- assay(filtered) - rowData(filtered, use.names = FALSE)$mean
}
# Get contrast fold changes ('contrast')
if(type == "contrast") {
df <- rowData(filtered, use.names = FALSE) %>%
data.frame() %>%
column_to_rownames(var = "name") %>%
select(ends_with("_diff"))
colnames(df) <-
gsub("_diff", "", colnames(df)) %>%
gsub("_vs_", " vs ", .)
df <- as.matrix(df)
}
# Facultative kmeans clustering
if(kmeans & obs_NA) {
warning("Cannot perform kmeans clustering with missing values",
call. = FALSE)
kmeans <- FALSE
}
if(kmeans & !obs_NA) {
set.seed(1)
df_kmeans <- kmeans(df, k)
if(type == "centered") {
# Order the k-means clusters according to the maximum fold change
# in all samples averaged over the proteins in the cluster
order <- data.frame(df) %>%
cbind(., cluster = df_kmeans$cluster) %>%
mutate(row = apply(.[, seq_len(ncol(.) - 1)], 1, function(x) max(x))) %>%
group_by(cluster) %>%
summarize(index = sum(row)/n()) %>%
arrange(desc(index)) %>%
pull(cluster) %>%
match(seq_len(k), .)
df_kmeans$cluster <- order[df_kmeans$cluster]
}
if(type == "contrast") {
# Order the k-means clusters according to their average fold change
order <- data.frame(df) %>%
cbind(df, cluster = df_kmeans$cluster) %>%
gather(condition, diff, -cluster) %>%
group_by(cluster) %>%
summarize(row = mean(diff)) %>%
arrange(desc(row)) %>%
pull(cluster) %>%
match(seq_len(k), .)
df_kmeans$cluster <- order[df_kmeans$cluster]
}
}
if(ncol(df) == 1) {
col_clust = FALSE
} else {
col_clust = TRUE
}
if(nrow(df) == 1) {
row_clust = FALSE
} else {
row_clust = TRUE
}
if(clustering_distance == "gower") {
clustering_distance <- function(x) {
dist <- cluster::daisy(x, metric = "gower")
dist[is.na(dist)] <- max(dist, na.rm = TRUE)
return(dist)
}
}
# Legend info
legend <- ifelse(type == "contrast",
"log2 Fold change",
"log2 Centered intensity")
# Heatmap
ht1 = Heatmap(df,
col = circlize::colorRamp2(
seq(-col_limit, col_limit, (col_limit/5)),
rev(RColorBrewer::brewer.pal(11, "RdBu"))),
split = if(kmeans) {df_kmeans$cluster} else {NULL},
cluster_rows = col_clust,
cluster_columns = row_clust,
row_names_side = "left",
column_names_side = "top",
clustering_distance_rows = clustering_distance,
clustering_distance_columns = clustering_distance,
heatmap_legend_param = list(color_bar = "continuous",
legend_direction = "horizontal",
legend_width = unit(5, "cm"),
title_position = "lefttop"),
name = legend,
row_names_gp = gpar(fontsize = row_font_size),
column_names_gp = gpar(fontsize = col_font_size),
top_annotation = ha1,
...)
if(plot) {
# Plot
draw(ht1, heatmap_legend_side = "top")
} else {
# Return data.frame
colnames(df) <- gsub(" ", "_", colnames(df))
df <- df[, unlist(column_order(ht1))]
if(kmeans) {
df <- cbind(df, k = df_kmeans$cluster)
}
return <- df[unlist(row_order(ht1)),]
data.frame(protein = row.names(return), return) %>%
mutate(order = row_number())
}
}
#' Volcano plot
#'
#' \code{plot_volcano} generates a volcano plot for a specified contrast.
#'
#' @param dep SummarizedExperiment,
#' Data object for which differentially enriched proteins are annotated
#' (output from \code{\link{test_diff}()} and \code{\link{add_rejections}()}).
#' @param contrast Character(1),
#' Specifies the contrast to plot.
#' @param label_size Integer(1),
#' Sets the size of name labels.
#' @param add_names Logical(1),
#' Whether or not to plot names.
#' @param adjusted Logical(1),
#' Whether or not to use adjusted p values.
#' @param plot Logical(1),
#' If \code{TRUE} (default) the volcano plot is produced.
#' Otherwise (if \code{FALSE}), the data which the
#' volcano plot is based on are returned.
#' @return A volcano plot (generated by \code{\link[ggplot2]{ggplot}})
#' @examples
#' # Load example
#' data <- UbiLength
#' data <- data[data$Reverse != "+" & data$Potential.contaminant != "+",]
#' data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#'
#' # Make SummarizedExperiment
#' columns <- grep("LFQ.", colnames(data_unique))
#' exp_design <- UbiLength_ExpDesign
#' se <- make_se(data_unique, columns, exp_design)
#'
#' # Filter, normalize and impute missing values
#' filt <- filter_missval(se, thr = 0)
#' norm <- normalize_vsn(filt)
#' imputed <- impute(norm, fun = "MinProb", q = 0.01)
#'
#' # Test for differentially expressed proteins
#' diff <- test_diff(imputed, "control", "Ctrl")
#' dep <- add_rejections(diff, alpha = 0.05, lfc = 1)
#'
#' # Plot volcano
#' plot_volcano(dep, 'Ubi6_vs_Ctrl', label_size = 5, add_names = TRUE)
#' plot_volcano(dep, 'Ubi6_vs_Ctrl', label_size = 5,
#' add_names = TRUE, adjusted = TRUE)
#' plot_volcano(dep, 'Ubi6_vs_Ctrl', add_names = FALSE)
#' plot_volcano(dep, 'Ubi4_vs_Ctrl', label_size = 5, add_names = TRUE)
#' @export
plot_volcano <- function(dep, contrast, label_size = 3,
add_names = TRUE, adjusted = FALSE, plot = TRUE) {
# Show error if inputs are not the required classes
if(is.integer(label_size)) label_size <- as.numeric(label_size)
assertthat::assert_that(inherits(dep, "SummarizedExperiment"),
is.character(contrast),
length(contrast) == 1,
is.numeric(label_size),
length(label_size) == 1,
is.logical(add_names),
length(add_names) == 1,
is.logical(adjusted),
length(adjusted) == 1,
is.logical(plot),
length(plot) == 1)
row_data <- rowData(dep, use.names = FALSE)
# Show error if inputs do not contain required columns
if(any(!c("name", "ID") %in% colnames(row_data))) {
stop(paste0("'name' and/or 'ID' columns are not present in '",
deparse(substitute(dep)),
"'.\nRun make_unique() to obtain required columns."),
call. = FALSE)
}
if(length(grep("_p.adj|_diff", colnames(row_data))) < 1) {
stop(paste0("'[contrast]_diff' and '[contrast]_p.adj' columns are not present in '",
deparse(substitute(dep)),
"'.\nRun test_diff() to obtain the required columns."),
call. = FALSE)
}
if(length(grep("_significant", colnames(row_data))) < 1) {
stop(paste0("'[contrast]_significant' columns are not present in '",
deparse(substitute(dep)),
"'.\nRun add_rejections() to obtain the required columns."),
call. = FALSE)
}
# Show error if an unvalid contrast is given
if (length(grep(paste(contrast, "_diff", sep = ""),
colnames(row_data))) == 0) {
valid_cntrsts <- row_data %>%
data.frame() %>%
select(ends_with("_diff")) %>%
colnames(.) %>%
gsub("_diff", "", .)
valid_cntrsts_msg <- paste0("Valid contrasts are: '",
paste0(valid_cntrsts, collapse = "', '"),
"'")
stop("Not a valid contrast, please run `plot_volcano()` with a valid contrast as argument\n",
valid_cntrsts_msg,
call. = FALSE)
}
# Generate a data.frame containing all info for the volcano plot
diff <- grep(paste(contrast, "_diff", sep = ""),
colnames(row_data))
if(adjusted) {
p_values <- grep(paste(contrast, "_p.adj", sep = ""),
colnames(row_data))
} else {
p_values <- grep(paste(contrast, "_p.val", sep = ""),
colnames(row_data))
}
signif <- grep(paste(contrast, "_significant", sep = ""),
colnames(row_data))
df <- data.frame(x = row_data[, diff],
y = -log10(row_data[, p_values]),
significant = row_data[, signif],
name = row_data$name) %>%
filter(!is.na(significant)) %>%
arrange(significant)
name1 <- gsub("_vs_.*", "", contrast)
name2 <- gsub(".*_vs_", "", contrast)
# Plot volcano with or without labels
p <- ggplot(df, aes(x, y)) +
geom_vline(xintercept = 0) +
geom_point(aes(col = significant)) +
geom_text(data = data.frame(), aes(x = c(Inf, -Inf),
y = c(-Inf, -Inf),
hjust = c(1, 0),
vjust = c(-1, -1),
label = c(name1, name2),
size = 5,
fontface = "bold")) +
labs(title = contrast,
x = expression(log[2]~"Fold change")) +
theme_DEP1() +
theme(legend.position = "none") +
scale_color_manual(values = c("TRUE" = "black", "FALSE" = "grey"))
if (add_names) {
p <- p + ggrepel::geom_text_repel(data = filter(df, significant),
aes(label = name),
size = label_size,
box.padding = unit(0.1, 'lines'),
point.padding = unit(0.1, 'lines'),
segment.size = 0.5)
}
if(adjusted) {
p <- p + labs(y = expression(-log[10]~"Adjusted p-value"))
} else {
p <- p + labs(y = expression(-log[10]~"P-value"))
}
if(plot) {
return(p)
} else {
df <- df %>%
select(name, x, y, significant) %>%
arrange(desc(x))
colnames(df)[c(1,2,3)] <- c("protein", "log2_fold_change", "p_value_-log10")
if(adjusted) {
colnames(df)[3] <- "adjusted_p_value_-log10"
}
return(df)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.