Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
out.width = "100%",
error = TRUE
)
knitr::opts_knit$set(global.par = TRUE)
## ---- include=FALSE-------------------------------------------------------------------------------
set.seed(1)
options(width = 100)
par(cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)
## ----eval=FALSE-----------------------------------------------------------------------------------
# if(!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("proDA")
## ----eval=FALSE-----------------------------------------------------------------------------------
# # install.packages("devtools")
# devtools::install_github("const-ae/proDA")
## ----quickstart-----------------------------------------------------------------------------------
# Load the package
library(proDA)
# Generate some dataset with known structure
syn_dataset <- generate_synthetic_data(n_proteins = 100, n_conditions = 2)
# The abundance matrix
syn_dataset$Y[1:5, ]
# Assignment of the samples to the two conditions
syn_dataset$groups
# Fit the probabilistic dropout model
fit <- proDA(syn_dataset$Y, design = syn_dataset$groups)
# Identify which proteins differ between Condition 1 and 2
test_diff(fit, `Condition_1` - `Condition_2`, sort_by = "pval", n_max = 5)
## -------------------------------------------------------------------------------------------------
system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE)
## -------------------------------------------------------------------------------------------------
# Load the table into memory
maxquant_protein_table <- read.delim(
system.file("extdata/proteinGroups.txt", package = "proDA", mustWork = TRUE),
stringsAsFactors = FALSE
)
## -------------------------------------------------------------------------------------------------
# I use a regular expression (regex) to select the intensity columns
intensity_colnames <- grep("^LFQ\\.intensity\\.", colnames(maxquant_protein_table), value=TRUE)
head(intensity_colnames)
# Create the intensity matrix
abundance_matrix <- as.matrix(maxquant_protein_table[, intensity_colnames])
# Adapt column and row maxquant_protein_table
colnames(abundance_matrix) <- sub("^LFQ\\.intensity\\.", "", intensity_colnames)
rownames(abundance_matrix) <- maxquant_protein_table$Protein.IDs
# Print some rows of the matrix with short names so they fit on the screen
abundance_matrix[46:48, 1:6]
## -------------------------------------------------------------------------------------------------
abundance_matrix[abundance_matrix == 0] <- NA
abundance_matrix <- log2(abundance_matrix)
abundance_matrix[46:48, 1:6]
## ----qc-mis_barplot, out.width="80%", fig.height=4, fig.align="center"---------------------------
barplot(colSums(is.na(abundance_matrix)),
ylab = "# missing values",
xlab = "Sample 1 to 36")
## ----qc-raw_boxplot, out.width="80%", fig.height=4, fig.align="center"----------------------------
boxplot(abundance_matrix,
ylab = "Intensity Distribution",
xlab = "Sample 1 to 36")
## -------------------------------------------------------------------------------------------------
normalized_abundance_matrix <- median_normalization(abundance_matrix)
## -------------------------------------------------------------------------------------------------
da <- dist_approx(normalized_abundance_matrix)
## ----sample_dist, out.width="60%", fig.align="center"---------------------------------------------
# This chunk only works if pheatmap is installed
# install.packages("pheatmap")
sel <- c(1:3, # CG1407
7:9, # CG59163
22:24)# CG6618
plot_mat <- as.matrix(da$mean)[sel, sel]
# Remove diagonal elements, so that the colorscale is not distorted
plot_mat[diag(9) == 1] <- NA
# 95% conf interval is approx `sd * 1.96`
uncertainty <- matrix(paste0(" ± ",round(as.matrix(da$sd * 1.96)[sel, sel], 1)), nrow=9)
pheatmap::pheatmap(plot_mat,
cluster_rows = FALSE, cluster_cols = FALSE,
display_numbers= uncertainty,
number_color = "black")
## -------------------------------------------------------------------------------------------------
# The best way to create this data.frame depends on the column naming scheme
sample_info_df <- data.frame(name = colnames(normalized_abundance_matrix),
stringsAsFactors = FALSE)
sample_info_df$condition <- substr(sample_info_df$name, 1, nchar(sample_info_df$name) - 3)
sample_info_df$replicate <- as.numeric(
substr(sample_info_df$name, nchar(sample_info_df$name) - 1, 20)
)
sample_info_df
## -------------------------------------------------------------------------------------------------
fit <- proDA(normalized_abundance_matrix, design = ~ condition,
col_data = sample_info_df, reference_level = "S2R")
fit
## -------------------------------------------------------------------------------------------------
# Equivalent to feature_parameters(fit)
fit$feature_parameters
## ----protein_dist, out.width="60%", fig.align="center"--------------------------------------------
# This chunk only works if pheatmap is installed
# install.packages("pheatmap")
pheatmap::pheatmap(dist_approx(fit[1:20, 1:3], by_sample = FALSE)$mean)
## -------------------------------------------------------------------------------------------------
# Test which proteins differ between condition CG1407 and S2R
# S2R is the default contrast, because it was specified as the `reference_level`
test_res <- test_diff(fit, "conditionCG1407")
test_res
## -------------------------------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.