#' @title Analysis of the Relative Quantifications
#'
#' @description Analysis of relative quantifications, including:
#'
#' - Annotations
#' - Summary files in different format (xls, txt) and shapes (long, wide)
#' - Numerous summary plots
#' - Enrichment analysis using Gprofiler
#' - PCA of quantifications
#' - Clustering analysis
#' - Basic imputation of missing values
#'
#' To run this function, the following packages must be installed on your system:
#' - From bioconductor:
#' `BiocManager::install(c("ComplexHeatmap", "org.Mm.eg.db"))`
#' - From CRAN:
#' `install.packages(c("factoextra", "FactoMineR", "gProfileR", "PerformanceAnalytics"))`
#'
#' @param log2fc_file (char) MSstats results file location
#' @param modelqc_file (char) MSstats modelqc file location
#' @param species (char) Select one species. Species currently supported for
#' a full analysis (including enrichment analysis):
#' - HUMAN
#' - MOUSE
#' To find out species supported only for annotation check
#' `?artmsIsSpeciesSupported()`
#'
#' @param output_dir (char) Name for the folder to output the results from the
#' function. Default is current directory (recommended to provide a new folder
#' name).
#' @param outliers (char) It allows to keep or remove outliers. Options:
#' - `keep` (default): it keeps outliers 'keep', 'iqr', 'std'
#' - `iqr` (recommended): remove outliers +/- 6 x Interquartile Range (IQR)
#' - `std` : 6 x standard deviation
#' @param enrich (logical) Performed enrichment analysis using GprofileR?
#' Only available for species HUMAN and MOUSE.
#' `TRUE` (default if "human" or "mouse" are the species) or `FALSE`
#' @param l2fc_thres (int) log2fc cutoff for enrichment analysis (default,
#' `l2fc_thres = 1.5`)
#' @param choosePvalue (char) specify whether `pvalue` or `adjpvalue` should
#' use for the analysis. The default option is `adjpvalue`
#' (multiple testing correction).
#' But if the number of biological replicates for a given experiment is
#' too low (for example n = 2), then `choosePvalue = pvalue` is recommended.
#' @param isBackground (char) background of gene names for enrichment analysis.
#' `nobackground` (default) will use the total number of genes detected.
#' Alternatively provided the file path name to the background gene list.
#' @param isPtm (char) Is a ptm-site quantification?
#' - `global` (default),
#' - `ptmsites` (for site specific analysis),
#' - `ptmph` (Jeff Johnson script output evidence file)
#' @param mnbr (int) **PARAMETER FOR NAIVE IMPUTATION**:
#' "minimal number of biological replicates" for "naive
#' imputation" and filtering. Default: `mnbr = 2`. Details:
#' Intensity values for proteins/PTMs that are completely missed in one
#' of the two conditions compared ("condition A"), but are found in
#' at least 2 biological replicates (`mnbr = 2`) of the other "condition B",
#' are imputed (values artificially assigned) and the log2FC values calculated.
#' The goal is to keep those proteins/PTMs that are consistently found in
#' one of the two conditions (in this case "condition B") and facilitate the
#' inclusion in downstream analysis (if wished). The imputed intensity values
#' are sampled from the lowest intensity values detected in the experiment,
#' and (**WARNING**) the p-values are just randomly assigned between
#' 0.05 and 0.01 for illustration purposes (when generating a volcano
#' plot with the output of `artmsAnalysisQuantifications`) or to include
#' them when making a cutoff of p-value < 0.05 for enrichment analysis
#' **CAUTION**: `mnbr` would also add the constraint that any protein
#' must be identified in at least `nmbr` biological replicates of the
#' **same** condition or it will be filtered out. That is,
#' if `mnbr = 2`, a protein found in two conditions but only in one
#' biological replicate in each of them, it would be removed.
#' @param pathogen (char) Is there a pathogen in the dataset as well?
#' if it does not, then use `pathogen = nopathogen` (default).
#' Pathogens available: `tb` (Tuberculosis), `lpn` (Legionella)
#' @param plotPvaluesLog2fcDist (logical) If `TRUE` (default) plots pvalues
#' and log2fc distributions
#' @param plotAbundanceStats (logical) If `TRUE` (default) plots stats graphs
#' about abundance values
#' @param plotReproAbundance (logical) If `TRUE` plots reproducibility
#' based on normalized abundance values
#' @param plotCorrConditions (logical) If `TRUE` plots correlation
#' between the different conditions
#' @param plotCorrQuant (logical) if `TRUE` plots correlation between the
#' available quantifications (comparisons)
#' @param plotPCAabundance (logical) if `TRUE` performs PCA analysis of
#' conditions using normalized abundance values
#' @param plotFinalDistributions (logical) if `TRUE` plots distribution of both
#' log2fc and pvalues
#' @param plotPropImputation (logical) if `TRUE` plots proportion of overall
#' imputation
#' @param plotHeatmapsChanges (logical) if `TRUE` plots heatmaps of quantified
#' changes (both all and significant only). Only if `printPDF` is also `TRUE`
#' @param plotTotalQuant (logical) if `TRUE` plots barplot of total number of
#' quantifications per comparison
#' @param plotClusteringAnalysis (logical) if `TRUE` performs clustering
#' analysis between quantified comparisons (more than 1 comparison required)
#' @param data_object (logical) flag to indicate whether the required files
#' are data objects. Default is FALSE
#' @param printPDF If `TRUE` (default) prints out the pdfs. Warning: plot
#' objects are not returned due to the large number of them.
#' @param verbose (logical) `TRUE` (default) shows function messages
#' @return (data.frame) summary of quantifications, including annotations,
#' enrichments, etc
#' @keywords analysis, quantifications
#' @examples
#' # Testing that the files cannot be empty
#' artmsAnalysisQuantifications(log2fc_file = NULL,
#' modelqc_file = NULL,
#' species = NULL,
#' output_dir = NULL)
#' @export
artmsAnalysisQuantifications <- function(log2fc_file,
modelqc_file,
species,
output_dir = "analysis_quant",
outliers = c("keep", "iqr", "std"),
enrich = TRUE,
l2fc_thres = 1,
choosePvalue = c("adjpvalue","pvalue"),
isBackground = "nobackground",
isPtm = "global",
mnbr = 2,
pathogen = "nopathogen",
plotPvaluesLog2fcDist = TRUE,
plotAbundanceStats = TRUE,
plotReproAbundance = TRUE,
plotCorrConditions = TRUE,
plotCorrQuant = TRUE,
plotPCAabundance = TRUE,
plotFinalDistributions = TRUE,
plotPropImputation = TRUE,
plotHeatmapsChanges = TRUE,
plotTotalQuant = TRUE,
plotClusteringAnalysis = TRUE,
data_object = FALSE,
printPDF = TRUE,
verbose = TRUE
) {
Uniprot_PTM = imputed = Gene_Protein = NULL
if(verbose){
message("---------------------------------------------")
message("artMS: ANALYSIS OF QUANTIFICATIONS")
message("---------------------------------------------")
}
# Checking arguments-----
if(any(missing(log2fc_file) |
missing(modelqc_file) |
missing(species) ))
stop("One (or many) of the required arguments missed.
Please, check the help for this function to find out more")
if(is.null(log2fc_file) &
is.null(modelqc_file) &
is.null(species) &
is.null(output_dir)){
return("The evidence_file, modelqc_file, species and output_dir arguments cannot be NULL")
}
# CHECK POINT: DO THE FILES EXIST?
if (!is.logical(data_object)) {
stop(" Argument <data_object> must be logical (TRUE or FALSE) ")
}
if(!data_object){
if(!file.exists(log2fc_file)){
stop("THE FILE ", log2fc_file, " DOES NOT EXIST ")
}
if(!file.exists(modelqc_file)){
stop("THE FILE ", modelqc_file, " DOES NOT EXIST ")
}
}
if (!is.logical(enrich)) {
stop(" Argument <enrich> must be logical (TRUE or FALSE) ")
}
if (!is.numeric(l2fc_thres)) {
stop(" Argument <l2fc_thres> must be numeric ")
}
if (!is.numeric(mnbr)) {
stop(" Argument <mnbr> must be numeric ")
}
if(!(isPtm %in% c('global', 'ptmph', 'ptmsites'))){
stop("The < isPtm > argument is wrong.
The valid options are: <global> or <ptmsites> ")
}
outliers <- tolower(outliers)
outliers <- match.arg(outliers)
choosePvalue <- tolower(choosePvalue)
choosePvalue <- match.arg(choosePvalue)
# Check supported species------
if(isFALSE(artmsIsSpeciesSupported(species))){
if(verbose) message("----(-) ", species, " is not supported. \n\tGene Symbol, Protein Name, and EntrezID won't be provided")
}
species <- tolower(species)
if(!(species %in% c('human', 'mouse'))){
if(enrich){
message("--- Enrichment analysis turned off (only available for HUMAN and MOUSE)")
enrich <- FALSE
}
}
# Checking required packages-----
## Bioconductor-----
cp <- 0
if(species == "human"){
if (!"org.Hs.eg.db" %in% installed.packages()){
message(paste0('- Package < org.Hs.eg.db > not installed in your system.
Please, install running < BiocManager::install(\"org.Hs.eg.db\") >'))
cp = cp + 1
}
}else if(species == "mouse"){
if (!"org.Mm.eg.db" %in% installed.packages()){
message(paste0('- Package < org.Mm.eg.db > not installed in your system.
Please, install running < BiocManager::install(\"org.Mm.eg.db\") >'))
cp = cp + 1
}
}
## CRAN -----
required_bioc_packages <- c("ComplexHeatmap")
for(p in required_bioc_packages){
if(!(p %in% installed.packages())){
message(paste0('- Package < ', p, ' > not installed in your system.
Please, install running < BiocManager::install(\"',p,'\") >'))
cp <- cp + 1
}
}
required_cran_packages <- c("factoextra", "FactoMineR", "gProfileR", "PerformanceAnalytics")
for(p in required_cran_packages){
if(!(p %in% installed.packages())){
message(paste0('- Package < ', p, ' > not installed in your system.
Please, install running < install.packages(\"',p,'\") >'))
cp <- cp + 1
}
}
if(cp > 0){
stop("\nRequired packages to run artmsAnalysisQuantifications() not available. Please, install and run again")
}
# Check pathogens-----
pathogen <- tolower(pathogen)
if (pathogen == "nopathogen") {
if(verbose) message("--- No Pathogen extra in these samples")
} else if (pathogen == "tb") {
# This should not work
if(verbose) message("\nPATHOGEN IN SAMPLES: Tuberculosis (TB)\n")
pathogen.ids <- artms_data_pathogen_TB
names(pathogen.ids) <- c('Entry')
} else if (pathogen == "lpn") {
if(verbose) message("\nPATHOGEN IN SAMPLES: LEGIONELLA PNEUMOPHILA\n")
pathogen.ids <- artms_data_pathogen_LPN
} else{
stop("This pathogen is not supported yet")
}
# Session info-----
session <- sessionInfo()
sink("artms_sessionInfo_analysisQuant.txt")
print(session)
sink()
# Create dir-----
log2dirname <- normalizePath(dirname(log2fc_file))
output_dir <- file.path(log2dirname, paste0(output_dir, "_", choosePvalue))
if(!dir.exists(file.path(output_dir))){
dir.create(file.path(output_dir), recursive = TRUE)
}
# Create the ouput_directory in the results folder
# dirname(log2fc_file)
# getwd()
# Open log2fc-----
if(verbose) message(">> LOADING QUANTIFICATIONS (-results.txt from MSstats) ")
if(data_object){
dflog2fcraw <- log2fc_file
log2fc_file <- "analysis-results.txt"
}else{
dflog2fcraw <- read.delim(log2fc_file,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)
}
# use this as a template to generate the output file names
log2fc_file <- basename(log2fc_file)
# Open modelqc ----
if(verbose) message(">> LOADING modelqc FILE (ABUNDANCE) ")
if(data_object){
dfmq <- modelqc_file
remove(modelqc_file)
}else{
dfmq <- read.delim(modelqc_file,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)
colnames(dfmq) <- toupper(colnames(dfmq))
}
# ModelQC: fix old files-----
if("GROUP_ORIGINAL" %in% colnames(dfmq)){
if(verbose) message("\n------------------------------------------------------------")
if(verbose) message("(!) WARNING!")
if(verbose) message("Be aware that you are analyzing a dataset that was generated with an old version of MSstats (< 4.0)")
if(verbose) message("Some changes will be applied")
if(verbose) message(" ------------------------------------------------------------\n")
dfmq$GROUP <- NULL
dfmq$GROUP <- dfmq$GROUP_ORIGINAL
dfmq$SUBJECT <- NULL
dfmq$SUBJECT <- dfmq$SUBJECT_ORIGINAL
}
# Removing the empty protein names-----
if (any(dfmq$PROTEIN == "")) {
dfmq <- dfmq[-which(dfmq$PROTEIN == ""), ]
if(verbose) message("--- Empty ID proteins removed from abundance data")
}
dfmq$PROTEIN <- gsub("(sp\\|)(.*)(\\|.*)", "\\2", dfmq$PROTEIN)
dfmq$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", dfmq$PROTEIN)
# Check protein ID----
if( length(dfmq$PROTEIN[grep("\\w{2}_\\d{1,}\\.\\d{1,}", dfmq$PROTEIN)]) > 100 ){
if(verbose) message("----(-) Many RefSeq IDs detected in this dataset, which is not supported yet.
Gene Symbol, Protein Name, and EntrezID won't be provided")
}
# Remove outliers------
if (outliers == "iqr" | outliers == "std"){
if(outliers == "iqr"){
if(verbose) message("--- Removing outliers (based on 6xIQR cutoff)")
iqr <- IQR(dfmq$ABUNDANCE)
m <- mean(dfmq$ABUNDANCE)
uplim <- m+(6*iqr)
lowlim <- m-(6*iqr)
}else if(outliers == "std"){
if(verbose) message("--- Removing outliers (based on 6xSTD cutoff)")
std <- sd(dfmq$ABUNDANCE)
m <- mean(dfmq$ABUNDANCE)
uplim <- m+3*std
lowlim <- m-3*std
}
# Record outliers for plotting
dfmq$outliers <- ifelse(dfmq$ABUNDANCE > lowlim & dfmq$ABUNDANCE < uplim, "no", "yes")
outliers_number <- length(dfmq$outliers[which(dfmq$outliers == "yes")])
# Report outliers removed
if (outliers_number == 0){
if(verbose) message("------> No outliers have been removed")
}else{
if(verbose) message("------> Total outliers removed: ", outliers_number)
out.outliers <- gsub(".txt", paste0("-outliers-removed-",outliers,".txt"), log2fc_file)
out.outliers <- paste0(output_dir, "/", out.outliers)
dfmq.outliers <- dfmq[which(dfmq$outliers == "yes"),]
write.table(
dfmq.outliers,
out.outliers,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
# PLOT Outliers
k <- ggplot(dfmq, aes(x = SUBJECT, y = ABUNDANCE, color = outliers))
k <- k + geom_jitter(width = 0.3, size = 0.5, na.rm = TRUE)
k <- k + theme_minimal()
k <- k + ggtitle("Distribution of ABUNDANCE with outliers")
k <-k + theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
))
# Distribution without ouliers
l <- ggplot(dfmq[which(dfmq$outliers == "no"),], aes(x = SUBJECT, y = ABUNDANCE))
l <- l + geom_jitter(width = 0.3, size = 0.5, na.rm = TRUE)
l <- l + theme_minimal()
l <- l + ggtitle("Distribution of ABUNDANCE (outliers removed)")
l <- l + theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5
))
out.outliers <- gsub(".txt", paste0("-outliers-removed-",outliers,".pdf"), log2fc_file)
out.outliers <- paste0(output_dir, "/", out.outliers)
if(printPDF) pdf(out.outliers)
plot(k)
plot(l)
if(printPDF) garbage <- dev.off()
if(verbose) message("------> Check table and plot to find out more about outliers removed")
# Remove outliers
dfmq <- dfmq[which(dfmq$ABUNDANCE > lowlim & dfmq$ABUNDANCE < uplim), ]
}
}else{
if(verbose) message("--- Outliers kept (user selection)")
dfmq <- dfmq
}
# Conditions----
conditions <- unique(dfmq$GROUP)
numberConditions <- length(conditions)
# KEY STEP: getting background gene list-----
if (isBackground == "nobackground") {
# If not list of background genes is provided,
# then extract them from the modelqc file
if (isPtm == "global") {
suppressMessages(
dfmq2Genes <- artmsAnnotationUniprot(dfmq, 'PROTEIN', species)
)
numberTotalGenes <- length(unique(dfmq2Genes$Gene))
if(verbose) message("--- Total number of genes/proteins: ", numberTotalGenes, " ")
listOfGenes <- unique(dfmq2Genes$Gene)
} else if (grepl("ptm", isPtm)) {
dfmq2Genes <- dfmq[c('PROTEIN', 'GROUP')]
names(dfmq2Genes)[grep('PROTEIN', names(dfmq2Genes))] <- 'Protein'
# # Removing party sites
# dfmq2Genes <- dfmq2Genes[grep(",", dfmq2Genes$Protein, invert = TRUE), ]
dfmq2Genes <- .artmsExtractUniprotId(x = dfmq2Genes,
uniprotPtmColumn = "Protein",
newColumnName = "Protein")
dfmq2Genes <- unique(dfmq2Genes)
suppressMessages(
dfmq2Genes <- artmsAnnotationUniprot(dfmq2Genes, 'Protein', species)
)
numberTotalGenes <- length(unique(dfmq2Genes$Gene))
if(verbose) message("--- TOTAL NUMBER OF GENES/PROTEINS: ",
numberTotalGenes, " ")
if (numberTotalGenes == 0) {
stop("IDs are not recognized")
}
listOfGenes <- unique(dfmq2Genes$Gene)
}
} else{
# No matter what list is provided, it must come with a "Gene" column
backgroundList <- read.delim(isBackground,
header = TRUE,
sep = "\t",
quote = "",
stringsAsFactors = FALSE)
listOfGenes <- unique(backgroundList$Gene)
}
backgroundNumber <- length(listOfGenes)
# Process log2fc----
if (any(dflog2fcraw$Protein == "")) {
dflog2fcraw <- dflog2fcraw[-which(dflog2fcraw$Protein == ""), ]
if(verbose) message("--- Empty ID proteins removed from log2fc data")
}
dflog2fcraw$Protein <- gsub("(sp\\|)(.*)(\\|.*)", "\\2", dflog2fcraw$Protein)
dflog2fcraw$Protein <- gsub("(.*)(\\|.*)", "\\1", dflog2fcraw$Protein)
dflog2fcfinites <- dflog2fcraw[is.finite(dflog2fcraw$log2FC), ]
## Removing log2fc outliers----
cutofflog2fc <- 15
if(verbose) message(
"--- Removing log2fc outliers (",
paste0("-", cutofflog2fc, " < log2fc < +", cutofflog2fc),
") "
)
filtermorethan10 <- length(dflog2fcfinites$log2FC[abs(dflog2fcfinites$log2FC) > cutofflog2fc])
if (filtermorethan10 > 0) {
if(verbose) message("------ (-) Removing [",
filtermorethan10,
"] protein ids with a abs(log2fc) >",
cutofflog2fc,
" ")
dflog2fcfinites <- dflog2fcfinites[-which(abs(dflog2fcfinites$log2FC) > cutofflog2fc), ]
}
## IMPUTING MISSING VALUES-----
# When a value is completely missed in one of the conditions,
# the log2fc = Inf / -Inf. Here, we impute those values.
# The imputation method works as follow. The assumption is that those
# proteins are likely present as well in those conditions where are missed,
# but due to the
# small sampling (usually 2 or 3 biological replicas) and other proteomics
# related issue, those proteins didn't make it through the level of detection.
# Therefore, a small intensity (sampled from the bottom 5%) will be assigned
# to the protein/site in the missing condition, and the new log2fc is
# re-calculated out of the MSstats box. Two issues are addressed in this way
# 1. If a protein has been consistently identified in one of the conditions,
# it will stay
# 2. But if the intensity value in those conditions was too low, then the
# log2fc will be also low
if(verbose) message(">> IMPUTING MISSING VALUES ")
# Select infinite values (i.e., log2fc missed for that)
dflog2fcinfinites <- dflog2fcraw[is.infinite(dflog2fcraw$log2FC), ]
numberInfinites <- dim(dflog2fcinfinites)[1]
# Control
if (numberInfinites < 1) {
if(verbose) message(" WARNING: O Infinite values (not very usual)")
} else {
if(verbose) message("--- Number of +/- INF values: ", dim(dflog2fcinfinites)[1], " ")
imputedL2FCmelted <- .artms_imputeMissingValues(dflog2fcinfinites = dflog2fcinfinites,
dfmq = dfmq)
# Merge with the original log2fc values to impute...
theImputedL2FC <- merge(dflog2fcinfinites,
imputedL2FCmelted,
by = c("Protein", "Label"),
all = FALSE)
# check size of theImputedL2FC compared with dflog2fcinfinites to see if we lost rows during imputation
# possibly we were unable to impute rows after removing outliers
if (nrow(theImputedL2FC) != nrow (dflog2fcinfinites)){
numMissing = nrow (dflog2fcinfinites) - nrow(theImputedL2FC)
message(" WARNING: Failed to impute values for ", numMissing, "rows. Possibly due to outlier removal")
}
theImputedL2FC$imputed <- "yes"
}
# Getting the data ready for merging
dflog2fcfinites$imputed <- "no"
dflog2fcfinites$iLog2FC <- dflog2fcfinites$log2FC
## Choose the pvalue or adjusted pvalue as the iPvalue-----
if (choosePvalue == "pvalue") {
dflog2fcfinites$iPvalue <- dflog2fcfinites$pvalue
} else if (choosePvalue == "adjpvalue") {
dflog2fcfinites$iPvalue <- dflog2fcfinites$adj.pvalue
} else{
stop("Only <pvalue> or <adjpvalue> for argument <choosePvalue>")
}
# Merging: NA values are thrown away at this point
if (numberInfinites < 1) {
dflog2fc <- dflog2fcfinites
} else {
dflog2fc <- rbind(dflog2fcfinites, theImputedL2FC)
}
## Plot pvalue distributions-----
if(plotPvaluesLog2fcDist){
if(verbose) message("--- Plotting distributions of log2fc and pvalues ")
plotDFdistColor <-
ggplot(dflog2fc, aes(x = log2FC, fill = Label)) +
geom_histogram(bins = 100,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution log2FC", x = "log2FC")
plotDFdistAll <- ggplot(dflog2fc, aes(x = log2FC)) +
geom_histogram(bins = 100,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution log2FC", x = "log2FC")
plotDFdistiLog <- ggplot(dflog2fc, aes(x = iLog2FC)) +
geom_histogram(bins = 100,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution ilog2FC (imputed + nonimputed", x = "iLog2FC")
plotPvalues <-
ggplot(dflog2fc[is.finite(dflog2fc$pvalue), ], aes(x = pvalue)) +
geom_histogram(bins = 50,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution p-values", x = "p-values")
plotAdjustedPvalues <-
ggplot(dflog2fc[-which(dflog2fc$adj.pvalue == 0), ], aes(x = adj.pvalue)) +
geom_histogram(bins = 150,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution adj.pvalues", x = "adj.values")
plotAdjustedIpvalues <- ggplot(dflog2fc, aes(x = iPvalue)) +
geom_histogram(bins = 150,
alpha = .4,
col = "black",
na.rm = TRUE) +
labs(title = "Distribution imputed p-values", x = "iPvalues")
# DISTRIBUTION PRINT OUTS
distributionsOut <- gsub(".txt", ".distributions.pdf", log2fc_file)
distributionsOut <- paste0(output_dir, "/", distributionsOut)
if(printPDF) pdf(distributionsOut)
print(plotDFdistColor)
print(plotDFdistAll)
print(plotDFdistiLog)
print(plotPvalues)
print(plotAdjustedPvalues)
print(plotAdjustedIpvalues)
if (numberInfinites > 0) {
hist(
imputedL2FCmelted$iLog2FC,
breaks = 100,
main = paste0("Imputed Log2FC (all)\n n = ", dim(imputedL2FCmelted)[1]),
xlab = "log2fc"
)
hist(
theImputedL2FC$iLog2FC,
breaks = 100,
main = paste0("Imputed Log2FC merged\n n = ", dim(theImputedL2FC)[1]),
xlab = "log2fc"
)
}
hist(
dflog2fcfinites$pvalue,
breaks = 100,
main = paste0("p-value distribution\n n = ", dim(dflog2fcfinites)[1]),
xlab = "adj.pvalues"
)
hist(
dflog2fcfinites$adj.pvalue,
breaks = 100,
main = paste0("Adjusted p-values distribution\n n = ",
dim(dflog2fcfinites)[1]),
xlab = "adj.pvalues"
)
hist(
dflog2fcfinites$iLog2FC,
breaks = 1000,
main = paste0(
"Non-imputed Log2FC distribution\n n = ",
dim(dflog2fcfinites)[1]
),
xlab = "log2FC"
)
hist(
dflog2fc$iPvalue,
breaks = 100,
main = paste0(
"(Imputed+NonImputed) adjusted pvalue distribution\n n = ",
dim(dflog2fc)[1]
),
xlab = "adj.pvalues"
)
hist(
dflog2fc$iLog2FC,
breaks = 1000,
main = paste0(
"(Imputed+NonImputed) log2fc distribution\n n = ",
dim(dflog2fc)[1]
),
xlab = "log2FC"
)
if(printPDF) garbage <- dev.off()
}
# Plot correlations based on log2fc------
if(plotCorrQuant){
if(verbose) message(">> PLOT: CORRELATION BETWEEN QUANTIFICATIONS (based on log2fc values)")
if (length(unique(dflog2fc$Label)) > 1) {
relaChanges <- gsub(".txt", ".correlationQuantifications.pdf", log2fc_file)
relaChanges <- paste0("plot.", relaChanges)
relaChanges <- paste0(output_dir, "/", relaChanges)
if(printPDF) pdf(relaChanges)
.artms_plotRatioLog2fc(dflog2fc, verbose = verbose)
if(printPDF) garbage <- dev.off()
} else{
if(verbose) message("--- Only one Comparison is available (correlation is not possible) ")
}
}
# ABUNDANCE-----
## Relationship between conditions------
# Get the number of biological replicas based on the first condition
theConditions <- unique(dfmq$GROUP)
theFirstCond <- theConditions[2]
condFirst <- dfmq[which(dfmq$GROUP == theFirstCond), ]
theBiologicalReplicas <- unique(condFirst$SUBJECT)
numberBioReplicas <- length(theBiologicalReplicas)
## ABUNDANCE PLOTS----
if(plotAbundanceStats){
if(verbose) message(">> PLOTS: ABUNDANCE PLOTS ")
abundancesName <- gsub(".txt", ".relativeABUNDANCE.pdf", log2fc_file)
abundancesName <- paste0("plot.", abundancesName)
abundancesName <- paste0(output_dir, "/", abundancesName)
if(printPDF) pdf(abundancesName)
.artms_plotAbundanceBoxplots(df = dfmq)
.artms_plotNumberProteinsAbundance(df = dfmq)
if(printPDF) garbage <- dev.off()
}
## Reproducibility plots-----
if(plotReproAbundance){
if(verbose) message(">> PLOTS: REPRODUCIBILITY PLOTS ")
reproName <- gsub(".txt", ".reproducibilityAbundance.pdf", log2fc_file)
reproName <- paste0("plot.", reproName)
reproName <- paste0(output_dir, "/", reproName)
if(printPDF) pdf(reproName)
.artms_plotReproducibilityAbundance(x = dfmq, verbose = verbose)
if(printPDF) garbage <- dev.off()
}
## Correlation between Conditions------
if(plotCorrConditions){
if(verbose) message(">> PLOT: CORRELATION BETWEEN ALL COMPARISONS ")
relaCond <- gsub(".txt", ".correlationConditions.pdf", log2fc_file)
relaCond <- paste0("plot.", relaCond)
relaCond <- paste0(output_dir, "/", relaCond)
if(printPDF) pdf(relaCond)
.artms_plotCorrelationConditions(x = dfmq,
numberBiologicalReplicas = numberBioReplicas)
if(printPDF) garbage <- dev.off()
}
## ABUNDANCE DATA, CREATE FILTERS-----
abundance <- .artms_loadModelqcBasic(dfmq)
names(abundance)[grep('Protein', names(abundance))] <- 'Prey'
names(abundance)[grep('Condition', names(abundance))] <- 'Bait'
# TECHNICAL REPLICAS: if there are technical replicas means that we will find
# two values for the same protein in the same bioreplica, therefore we need to
# aggregate first just in case:
##LEGACY
# abundance <- aggregate(Abundance ~ Prey + Bait + BioReplicate,
# data = abundance,
# FUN = mean)
# abundance_dcmean <- data.table::dcast(abundance,
# Prey ~ Bait,
# value.var = 'Abundance',
# fun.aggregate = mean,
# fill = 0)
# Let's aggregate to get the sum of the abundance, we will use it later.
# abundance_dcsum <- data.table::dcast(abundance,
# Prey ~ Bait,
# value.var = 'Abundance',
# fun.aggregate = sum,
# fill = 0)
abundance_dcsum <- abundance %>% pivot_wider(names_from = Bait,
id_cols = Prey,
values_from = Abundance,
values_fn = list(Abundance = sum))
abundance_dcmean <- abundance %>% pivot_wider(names_from = Bait,
id_cols = Prey,
values_from = Abundance,
values_fn = list(Abundance = mean))
# Melt again the sum and mean
##LEGACY
# abundancelongsum <- data.table::melt(abundance_dcsum,
# id.vars = c('Prey'),
# value.name = 'Abundance',
# variable.name = 'Bait')
#
# abundancelongmean <- data.table::melt(abundance_dcmean,
# id.vars = c('Prey'),
# value.name = 'Abundance',
# variable.name = 'Bait')
abundancelongsum <- abundance_dcsum %>% pivot_longer(cols = -Prey,
names_to = "Bait",
values_to = "AbSum")
abundancelongmean <- abundance_dcmean %>% pivot_longer(cols = -Prey,
names_to = "Bait",
values_to = "AbMean")
# We dont need the 0 values
abundancelongsum <- abundancelongsum[!(abundancelongsum$AbSum == 0), ]
abundancelongmean <- abundancelongmean[!(abundancelongmean$AbMean == 0), ]
abundancelongsum <- abundancelongsum[!(is.na(abundancelongsum$AbSum)), ]
abundancelongmean <- abundancelongmean[!( is.na(abundancelongmean$AbMean) ), ]
abundancelongsummean <- merge(abundancelongsum, abundancelongmean, by = c('Prey', 'Bait'))
remove(abundancelongsum, abundancelongmean, abundance_dcsum, abundance_dcmean)
# REPRODUCIBILITY AND SPECIFICY PARATEMERS
# Get the number of bioreplicates based on abundance data
##LEGACY
# nbr_wide <- data.table::dcast(abundance,
# Prey ~ Bait,
# value.var = 'Abundance',
# fun.aggregate = length,
# fill = 0)
# nbr_long <- data.table::melt(nbr_wide,
# id.vars = c('Prey'),
# value.name = 'Abundance',
# variable.name = 'Bait')
#
# nbr_long <- nbr_long[!(nbr_long$BioRep == 0), ]
# names(nbr_long)[grep('Abundance', names(nbr_long))] <- 'BioRep'
nbr_wide <- abundance %>% pivot_wider(names_from = Bait,
id_cols = Prey,
values_from = Abundance,
values_fn = list(Abundance = length))
nbr_wide[(is.na(nbr_wide))] <- 0
nbr_long <- nbr_wide %>% pivot_longer(cols = -Prey,
names_to = "Bait",
values_to = "BioRep")
nbr_long <- nbr_long[!(nbr_long$BioRep == 0), ]
nbr_long <- nbr_long[!( is.na(nbr_long$BioRep) ), ]
# Get the number of replicates in long format
##LEGACY
# OUTreprod <- data.table::dcast(data = nbr_long, Prey ~ Bait, value.var = 'BioRep')
# OUTreprod[is.na(OUTreprod)] <- 0
OUTreprod <- abundance %>% pivot_wider(id_cols = Prey,
names_from = Bait,
values_from = Abundance,
values_fn = list(Abundance = length))
OUTreprod[is.na(OUTreprod)] <- 0
# Make a copy for conditions
OUTreproCondition <- OUTreprod
here <- dim(OUTreprod)[2]
# Make a copy to use later
bioReplicaInfo <- OUTreprod
# And get the total number of biological replicates
OUTreprod$BiorepCount <- rowSums(OUTreprod[, 2:here])
# Get whether a protein is found in all conditions
reprospec2merge <- subset(OUTreprod, select = c(Prey, BiorepCount))
##LEGACY
# OUTreproCondition <- data.table::dcast(data = nbr_long, Prey ~ Bait, value.var = 'BioRep')
# OUTreproCondition[is.na(OUTreproCondition)] <- 0
thedim <- dim(OUTreproCondition)[2]
thepreys <- subset(OUTreproCondition, select = c(Prey))
thevalues <- subset(OUTreproCondition, select = -c(Prey))
thevalues[thevalues > 0] <- 1
thevalues$CondCount <- rowSums(thevalues)
FinalReproCondition <- cbind(thepreys, thevalues)
reprocondition2merge <- subset(FinalReproCondition, select = c(Prey, CondCount))
# This version will be printed out below
OUTreprodFinal <- merge(OUTreprod, reprocondition2merge, by = 'Prey')
# PCA ANALYSIS-----
# It requires a simplified version for modelqc
if(plotPCAabundance){
if(verbose) message(">> PRINCIPAL COMPONENT ANALYSIS BASED ON ABUNDANCE ")
modelqcabundance <- .artms_loadModelQCstrict(df = dfmq,
species = species,
ptmis = isPtm,
verbose = verbose)
out.pca <- gsub(".txt", "-pca", log2fc_file)
out.pca <- paste0(output_dir, "/", out.pca)
suppressWarnings(.artms_getPCAplots(x = modelqcabundance,
filename = out.pca,
allConditions = conditions))
}
# ANNOTATIONS-----
modelqc_file_splc <- .artms_mergeAbNbr(df_input = dfmq,
repro = nbr_wide,
species = species)
if(verbose) message(">> ANNOTATIONS ")
# Now get ready for annotation
if(verbose) message("--- Abundance data ")
if (grepl("ptm", isPtm)) {
names(modelqc_file_splc)[grep('^Protein$', names(modelqc_file_splc))] <- 'Uniprot_PTM'
# Take the Protein ID, but being very careful about the fluomics labeling
modelqc_file_splc <- .artmsExtractUniprotId(x = modelqc_file_splc,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "Protein")
suppressMessages(
modelqc_file_splc <- artmsAnnotationUniprot(modelqc_file_splc, 'Protein', species)
)
} else{
suppressMessages(
modelqc_file_splc <- artmsAnnotationUniprot(modelqc_file_splc, 'Protein', species)
)
}
if(verbose) message("--- Relative Quantifications (Log2fc) ")
if(choosePvalue == "adjpvalue"){
choosePvalue <- "adj.pvalue"
}
# Prepare output of changes
log2fc_file_splc <- .artms_mergeChangesNbr(df_input = dflog2fc,
repro = nbr_wide,
species = species,
choosePvalue = choosePvalue)
# Now get ready for annotation
if (grepl("ptm", isPtm)) {
names(log2fc_file_splc)[grep('^Protein$', names(log2fc_file_splc))] <- 'Uniprot_PTM'
# Take the Protein ID, but being very careful about the fluomics labeling
log2fc_file_splc <- .artmsExtractUniprotId(x = log2fc_file_splc,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "Protein")
suppressMessages(
log2fc_file_splc <- artmsAnnotationUniprot(log2fc_file_splc, 'Protein', species)
)
} else{
suppressMessages(
log2fc_file_splc <- artmsAnnotationUniprot(log2fc_file_splc, 'Protein', species)
)
}
if(verbose) message(">> FILTERING CHANGES BEFORE PRINTING OUT ")
imputedDF <- dflog2fc[c('Protein',
'Label',
'log2FC',
'pvalue',
'adj.pvalue',
'imputed',
'iLog2FC',
'iPvalue'
)]
if(verbose) message("--- Merging Changes with replication data ")
imputedDF <- merge(imputedDF,
bioReplicaInfo,
by.x = 'Protein',
by.y = 'Prey',
all.x = TRUE)
if(verbose) message("--- Removing NA ")
imputedDF <- imputedDF[!is.na(imputedDF$log2FC), ]
if(verbose) message("--- Add labeling of condition more abundant in the quantification ")
imputedDF$CMA <- mapply(.artms_selectTheOneLog2fc,
imputedDF$iLog2FC,
imputedDF$Label)
if(verbose) message("--- Removing proteins not found in a minimal number <",
mnbr,
"> (user selected) of biological replicates ")
before <- dim(imputedDF)[1]
imputedDF <- .artms_RemoveProtBelowThres(imputedDF, mnbr)
after <- dim(imputedDF)[1]
total_remove <- before - after
if(verbose) message("\t", total_remove, " proteins removed (i.e., not found in more than <", mnbr, "> biological replicates in at least one condition)")
if(verbose) message("--- Filtering is done! ")
# PLOTS: GENERATING QC PLOTS ABOUT CHANGES-----
if(plotFinalDistributions){
if(verbose) message(">> GENERATING QC PLOTS ABOUT CHANGES (log2fc) ")
if(verbose) message("--- Distribution of log2fc and pvalues ")
distributionsFilteredOut <- gsub(".txt", ".distributionsFil.pdf", log2fc_file)
distributionsFilteredOut <- paste0(output_dir, "/", distributionsFilteredOut)
if(printPDF) pdf(distributionsFilteredOut)
hist(imputedDF$iLog2FC,
breaks = 1000,
main = paste0("Filtered Log2FC (>2BR)\n n = ", dim(imputedDF)[1]),
xlab = "log2fc")
hist(imputedDF$iPvalue,
breaks = 1000,
main = paste0("Filtered p-values (>2BR)\n n = ", dim(imputedDF)[1]),
xlab = "p-value")
if(printPDF) garbage <- dev.off()
}
if(plotPropImputation){
if(verbose) message("--- Proportion imputed values ")
# Stats about imputed values
yesimputed <- dim(imputedDF[which(imputedDF$imputed == 'yes'), ])[1]
nonimputed <- dim(imputedDF[which(imputedDF$imputed == 'no'), ])[1]
dat <- data.frame(count = c(yesimputed, nonimputed),
category = c("Imputed", "Non-Imputed"))
# Add addition columns, needed for drawing with geom_rect.
dat$fraction = dat$count / sum(dat$count)
dat <- dat[order(dat$fraction),]
dat$ymax <- cumsum(dat$fraction)
dat$ymin <- c(0, head(dat$ymax, n = -1))
p1 <- ggplot(dat, aes(fill = category,
ymax = ymax,
ymin = ymin,
xmax = 4,
xmin = 3)) +
geom_rect(na.rm = TRUE) +
coord_polar(theta = "y") +
xlim(c(0, 4)) +
labs(title = "Proportion of Imputed Intensity values")
p2 <- ggplot(dat, aes(x = category, y = count, fill = category)) +
geom_bar(stat = "identity",
na.rm = TRUE) +
labs(title = "Proportion of Imputed Intensity values")
outImputation <- gsub(".txt", ".imputation.pdf", log2fc_file)
outImputation <- paste0(output_dir, "/", outImputation)
if(printPDF) pdf(outImputation)
print(p1)
print(p2)
if(printPDF) garbage <- dev.off()
}
if(plotHeatmapsChanges){
if(verbose) message(">> HEATMAPS OF CHANGES (log2fc) ")
##LEGACY
# l2fcol <- data.table::dcast(data = imputedDF,
# Protein ~ Label,
# value.var = 'iLog2FC')
l2fcol <- imputedDF %>% tidyr::pivot_wider(id_cols = Protein,
names_from = Label,
values_from = iLog2FC)
l2fcol <- as.data.frame(l2fcol)
rownames(l2fcol) <- l2fcol$Protein
l2fcol <- within(l2fcol, rm(Protein))
l2fcol[is.na(l2fcol)] <- 0
if (numberConditions > 1) {
l2fcolmatrix <- data.matrix(l2fcol)
if(verbose) message("--- All changes ")
if(printPDF){
outHeatMapOverallL2fc <- gsub(".txt",
".clustering.log2fc.all-overview.pdf",
log2fc_file)
outHeatMapOverallL2fc <- paste0(output_dir, "/", outHeatMapOverallL2fc)
plotPhLog1 <- pheatmap(
l2fcolmatrix,
filename = outHeatMapOverallL2fc,
cellwidth = 20,
main = "Clustering Log2FC",
cluster_cols = FALSE,
clustering_method = "average",
fontfamily = "Helvetica",
show_colnames = FALSE,
fontsize = 6,
fontsize_row = 3,
fontsize_col = 10,
border_color = NA
)
print(plotPhLog1)
}else{
plotPhLog2 <- pheatmap(
l2fcolmatrix,
cellwidth = 20,
main = "Clustering Log2FC",
cluster_cols = FALSE,
clustering_method = "average",
fontfamily = "Helvetica",
show_colnames = FALSE,
fontsize = 6,
fontsize_row = 3,
fontsize_col = 10,
border_color = NA
)
print(plotPhLog2)
}
# Only significant pvalues
if(verbose) message("--- Only significant changes ")
imputedDFsig <- imputedDF[which(imputedDF$iPvalue < 0.05), ]
##LEGACY
# l2fcolSignificants <- data.table::dcast(data = imputedDFsig,
# Protein ~ Label,
# value.var = 'iLog2FC')
l2fcolSignificants <- imputedDFsig %>% tidyr::pivot_wider(id_cols = Protein,
names_from = Label,
values_from = iLog2FC)
l2fcolSignificants <- as.data.frame(l2fcolSignificants)
rownames(l2fcolSignificants) <- l2fcolSignificants$Protein
l2fcolSignificants <- within(l2fcolSignificants, rm(Protein))
l2fcolSignificants[is.na(l2fcolSignificants)] <- 0
l2fcolSignificantsmatrix <- data.matrix(l2fcolSignificants)
if(printPDF){
outHeatMapOverallL2fc <- gsub(".txt",
".clustering.log2fcSign.all-overview.pdf",
log2fc_file)
outHeatMapOverallL2fc <- paste0(output_dir, "/", outHeatMapOverallL2fc)
plotPh1 <- pheatmap(
l2fcolSignificantsmatrix,
filename = outHeatMapOverallL2fc,
cellwidth = 20,
main = "Clustering Log2FC (p-value < 0.05)",
cluster_cols = FALSE,
fontfamily = "Helvetica",
labels_row = "",
fontsize = 6,
fontsize_row = 8,
fontsize_col = 8,
border_color = NA,
fontfamily = "Helvetica")
print(plotPh1)
}else{
plotPh2 <- pheatmap(
l2fcolSignificantsmatrix,
cellwidth = 20,
main = "Clustering Log2FC (p-value < 0.05)",
cluster_cols = FALSE,
fontfamily = "Helvetica",
labels_row = "",
fontsize = 6,
fontsize_row = 8,
fontsize_col = 8,
border_color = NA,
fontfamily = "Helvetica")
print(plotPh2)
}
}
}
# ENRICHMENT OF MOST ABUNDANT PROTEINS (from IMPUTED LOG2FC values)-----
# Let's select first significance based on pvalue, by using the iPvalue
# we are already including the imputed pvalues...
##LEGACY
# l2fcol4enrichment <- data.table::dcast(data = imputedDF[which(imputedDF$iPvalue < 0.05), ],
# Protein ~ Label,
# value.var = 'iLog2FC')
l2fcol4enrichment <- imputedDF %>%
dplyr::filter(iPvalue < 0.05) %>%
tidyr::pivot_wider(id_cols = Protein,
names_from = Label,
values_from = iLog2FC)
l2fcol4enrichment <- as.data.frame(l2fcol4enrichment)
## GPROFILER=====
if (enrich == TRUE & dim(l2fcol4enrichment)[1] > 0) {
if(verbose) message(">> ENRICHMENT ANALYSIS OF SELECTED CHANGES USING GPROFILER ")
if (dim(l2fcol4enrichment)[1] > 0) {
# Let's melt now for enrichment analysis
##LEGACY
# l2fcol4enrichment <- data.table::melt(data = l2fcol4enrichment,
# id.vars = c('Protein'),
# variable.name = "Comparisons")
l2fcol4enrichment <- l2fcol4enrichment %>%
tidyr::pivot_longer(cols = -Protein,
names_to = "Comparisons",
values_to = "value")
l2fcol4enrichment <- as.data.frame(l2fcol4enrichment)
l2fcol4enrichment <- l2fcol4enrichment[complete.cases(l2fcol4enrichment), ]
# Now get ready for annotation
if (grepl("ptm", isPtm)) {
names(l2fcol4enrichment)[grep('^Protein$', names(l2fcol4enrichment))] <- 'Uniprot_PTM'
# Take the Protein ID, but being very careful about the fluomics labeling
l2fcol4enrichment <- .artmsExtractUniprotId(x = l2fcol4enrichment,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "Protein")
suppressMessages(
l2fcol4enrichment <- artmsAnnotationUniprot(l2fcol4enrichment,
'Protein',
species)
)
} else{
suppressMessages(
l2fcol4enrichment <- artmsAnnotationUniprot(l2fcol4enrichment,
'Protein',
species)
)
}
}
if (grepl("ptm", isPtm)) {
# l2fcol4enrichment <-
# within(l2fcol4enrichment, rm(Gene,Uniprot_PTM,Protein.names))
# # Remove parties for enrichment
# l2fcol4enrichment <- l2fcol4enrichment[grep(",", l2fcol4enrichment$Protein, invert = TRUE),]
# Select the Uniprot ID, but keep in mind that some of them might
# have many _ph54_ph446 before
# l2fcol4enrichment$Protein <-
# gsub("^(\\S+?)_.*", "\\1", l2fcol4enrichment$Protein, perl = TRUE)
l2fcol4enrichment <- unique(l2fcol4enrichment[c("Protein", "Gene", "Comparisons", "value")])
}
# ALL SIGNIFICANT CHANGES log2fc
# GPROFILER
if(verbose) message("1) Enrichment of ALL significant Changes ")
filallsig_log2fc_long <- l2fcol4enrichment[which(abs(l2fcol4enrichment$value) >= l2fc_thres),]
if (dim(filallsig_log2fc_long)[1] > 0) {
out.mac.allsig <- gsub(".txt", "-enrich-MAC-allsignificants.txt", log2fc_file)
out.mac.allsig <- paste0(output_dir, "/", out.mac.allsig)
mac.allsig <- NULL
tryCatch(
mac.allsig <- artmsEnrichLog2fc(
dataset = filallsig_log2fc_long,
output_name = out.mac.allsig,
species = species,
heatmaps = TRUE,
background = listOfGenes,
verbose = verbose
), error = function(e){
message("\n\t(Error): Enrichment is not possible! ")
message("\tgProfiler server night be down or your internet connection is not working")
enrich <- FALSE
}
)
if(!is.null(mac.allsig)){
if (dim(mac.allsig)[1] > 0) {
write.table(
mac.allsig,
out.mac.allsig,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
}
if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
# CORUM
allsigComplexEnriched <- .artms_enrichForComplexes(df = filallsig_log2fc_long,
backgroundNumber = backgroundNumber)
if (dim(allsigComplexEnriched)[1] > 0) {
out.mac.allsig.corum <- gsub(".txt",
"-enrich-MAC-allsignificants-corum.txt",
log2fc_file)
out.mac.allsig.corum <- paste0(output_dir, "/", out.mac.allsig.corum)
write.table(
allsigComplexEnriched,
out.mac.allsig.corum,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
# And the heatmap
if (dim(allsigComplexEnriched)[1] > 2) {
out.mac.allsig.corum.pdf <- gsub(".txt",
"-enrich-MAC-allsignificants-corum.pdf",
log2fc_file)
out.mac.allsig.corum.pdf <- paste0(output_dir, "/", out.mac.allsig.corum.pdf)
.artms_plotCorumEnrichment(x = allsigComplexEnriched,
outfile = out.mac.allsig.corum.pdf,
theTitle = "MAC ALL SIGNIFICANT Protein Complex Enrichment")
} else{
message("--- (-) Not enough negative corum complexes to plot ")
}
} else{
message(" ----(-) No significant hits ")
mac.allsig <- NULL
allsigComplexEnriched <- NULL
}
# POSITIVE log2fc
# GPROFILER
if(verbose) message("2) Enrichment of selected POSITIVE significant changes ")
filpos_log2fc_long <- l2fcol4enrichment[which(l2fcol4enrichment$value >= l2fc_thres),]
if (dim(filpos_log2fc_long)[1] > 0) {
out.mac.pos <- gsub(".txt", "-enrich-MAC-positives.txt", log2fc_file)
out.mac.pos <- paste0(output_dir, "/", out.mac.pos)
mac.pos <- NULL
tryCatch(
mac.pos <- artmsEnrichLog2fc(
dataset = filpos_log2fc_long,
species = species,
heatmaps = TRUE,
output_name = out.mac.pos,
background = listOfGenes,
verbose = verbose
), error = function(e){
message("\n\t(Error): Enrichment is not possible! ")
message("\tgProfiler server night be down or your internet connection is not working")
enrich = FALSE
}
)
if(!is.null(mac.pos)){
if (dim(mac.pos)[1] > 0) {
write.table(
mac.pos,
out.mac.pos,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
}
if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
# CORUM
positiveComplexEnriched <- .artms_enrichForComplexes(filpos_log2fc_long, backgroundNumber)
if (dim(positiveComplexEnriched)[1] > 0) {
out.mac.pos.corum <-
gsub(".txt",
"-enrich-MAC-positives-corum.txt",
log2fc_file)
out.mac.pos.corum <-
paste0(output_dir, "/", out.mac.pos.corum)
write.table(
positiveComplexEnriched,
out.mac.pos.corum,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
# And the heatmap
if (dim(positiveComplexEnriched)[1] > 2) {
out.mac.pos.corum.pdf <- gsub(".txt",
"-enrich-MAC-positives-corum.pdf",
log2fc_file)
out.mac.pos.corum.pdf <- paste0(output_dir, "/", out.mac.pos.corum.pdf)
# out.mac.pos.corum.pdf <- 'whatever.corum.positive.pdf'
.artms_plotCorumEnrichment(
positiveComplexEnriched,
out.mac.pos.corum.pdf,
"MAC+ Protein Complex Enrichment"
)
} else{
if(verbose) message("\t----(-) Not enough positive corum complexes to plot ")
}
} else{
if(verbose) message("\t ------ Nothing is significant in the selected Positive log2fc ")
mac.pos <- NULL
positiveComplexEnriched <- NULL
}
# NEGATIVE log2fc
if(verbose) message("3) Enrichment of selected NEGATIVE significant changes ")
filneg_log2fc_long <- l2fcol4enrichment[which(l2fcol4enrichment$value <= -l2fc_thres),]
if (dim(filneg_log2fc_long)[1] > 0) {
out.mac.neg <- gsub(".txt", "-enrich-MAC-negatives.txt", log2fc_file)
out.mac.neg <- paste0(output_dir, "/", out.mac.neg)
mac.neg <- NULL
tryCatch(
mac.neg <- artmsEnrichLog2fc(
dataset = filneg_log2fc_long,
output_name = out.mac.neg,
species = species,
heatmaps = TRUE,
background = listOfGenes,
verbose = verbose),
error = function(e){
message("\n\t(Error): Enrichment is not possible! ")
message("\tgProfiler server night be down or your internet connection is not working")
enrich <- FALSE
})
if(!is.null(mac.neg)){
if (dim(mac.neg)[1] > 0) {
write.table(
mac.neg,
out.mac.neg,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
}
if(verbose) message("---+ Corum Protein Complex Enrichment Analysis ")
negativesComplexEnriched <- .artms_enrichForComplexes(filneg_log2fc_long, backgroundNumber)
if (dim(negativesComplexEnriched)[1] > 0) {
out.mac.neg.corum <- gsub(".txt",
"-enrich-MAC-negatives-corum.txt",
log2fc_file)
out.mac.neg.corum <- paste0(output_dir, "/", out.mac.neg.corum)
write.table(
negativesComplexEnriched,
out.mac.neg.corum,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
}
# And the heatmap
if (dim(negativesComplexEnriched)[1] > 2) {
out.mac.neg.corum.pdf <- gsub(".txt",
"-enrich-MAC-negatives-corum.pdf",
log2fc_file)
out.mac.neg.corum.pdf <- paste0(output_dir, "/", out.mac.neg.corum.pdf)
.artms_plotCorumEnrichment(x = negativesComplexEnriched,
outfile = out.mac.neg.corum.pdf,
theTitle = "MAC- Protein Complex Enrichment")
} else{
if(verbose) message("\t-----(-) Not enough negative corum complexes to plot ")
}
} else{
if(verbose) message("\t------ Nothing is significant in the NEGATIVE site of things ")
mac.neg <- NULL
negativesComplexEnriched <- NULL
}
} else{
if(verbose) message(">> NO ENRICHMENT of CHANGES (log2fc) SELECTED ")
mac.allsig <- NULL
mac.pos <- NULL
mac.neg <- NULL
}
# Superunified data version-----
superunified <- merge(abundancelongsummean,
nbr_long,
by = c('Bait', 'Prey'),
all = TRUE)
superunified <- merge(superunified,
reprocondition2merge,
by = 'Prey',
all = TRUE)
superunified <- merge(superunified,
reprospec2merge,
by = 'Prey',
all = TRUE)
if (grepl("ptm", isPtm)) {
names(superunified)[grep('^Prey$', names(superunified))] <- 'Uniprot_PTM'
# Take the Protein ID, but being very careful about the fluomics labeling
superunified <- .artmsExtractUniprotId(x = superunified,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "Prey")
}
suppressMessages(
superunified <- artmsAnnotationUniprot(superunified, 'Prey', species)
)
# Rename (before it was just a lazy way to use another code)
names(superunified)[grep('Bait', names(superunified))] <- 'Condition'
# ANNOTATE SPECIE
if(verbose) message("--- Annotating species(s) in files ")
superunified <- artmsAnnotateSpecie(superunified, pathogen, species)
if (grepl("ptm", isPtm)) {
if(verbose) message(">> GENERATING EXTENDED DETAILED VERSION OF PH-SITE ")
imputedDFext <- artmsGeneratePhSiteExtended(df = imputedDF,
pathogen = pathogen,
species = species,
ptmType = isPtm,
output_name = paste0(output_dir, "/", log2fc_file))
}
# Final OUTPUT FILES------
if(verbose) message(">> GENERATING FINAL OUTPUT FILES ")
if (grepl("ptm", isPtm)) {
names(imputedDF)[grep('Protein', names(imputedDF))] <- 'Uniprot_PTM'
imputedDF$UniprotID <- imputedDF$Uniprot_PTM
# The virus labeling has to be taken into account
# when getting the uniprot id:
imputedDF <- .artmsExtractUniprotId(x = imputedDF,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "UniprotID")
suppressMessages(
imputedDF <- artmsAnnotationUniprot(imputedDF,
'UniprotID',
species)
)
names(imputedDF)[grep("Label", names(imputedDF))] <- 'Comparison'
imputedDF <- artmsAnnotateSpecie(imputedDF, pathogen, species)
# Wide version of imputedDF
## LEGACY NOT TESTED!
# imputedDF_wide_log2fc <- data.table::dcast(
# data = imputedDF,
# Gene + Protein + EntrezID + Uniprot_PTM ~ Comparison,
# value.var = 'iLog2FC',
# fill = 0
# )
imputedDF_wide_log2fc <- imputedDF %>%
tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID, Uniprot_PTM),
names_from = Comparison,
values_from = iLog2FC)
##LEGACY NOT TESTED!
# imputedDF_wide_pvalue <-
# data.table::dcast(
# data = imputedDF,
# Gene + Protein + EntrezID + Uniprot_PTM ~ Comparison,
# value.var = 'iPvalue',
# fill = 0
# )
imputedDF_wide_pvalue <- imputedDF %>%
tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID, Uniprot_PTM),
names_from = Comparison,
values_from = iPvalue)
} else if (isPtm == "global") {
suppressMessages(
imputedDF <- artmsAnnotationUniprot(x = imputedDF,
columnid = 'Protein',
species = species)
)
names(imputedDF)[grep("Label", names(imputedDF))] <- 'Comparison'
imputedDF <- artmsAnnotateSpecie(imputedDF, pathogen, species)
# Wide version of imputedDF
##LEGACY
# imputedDF_wide_log2fc <- data.table::dcast(data = imputedDF,
# Gene + Protein + EntrezID ~ Comparison,
# value.var = 'iLog2FC',
# fill = 0)
imputedDF_wide_log2fc <- imputedDF %>%
tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID),
names_from = Comparison,
values_from = iLog2FC)
# LEGACY
# imputedDF_wide_pvalue <- data.table::dcast(data = imputedDF,
# Gene + Protein + EntrezID ~ Comparison,
# value.var = 'iPvalue',
# fill = 0)
imputedDF_wide_pvalue <- imputedDF %>%
tidyr::pivot_wider(id_cols = c(Gene, Protein, EntrezID),
names_from = Comparison,
values_from = iPvalue)
} else{
stop(" WRONG isPTM SELECTED.
OPTIONS AVAILABLE: global, ptmph, ptmsites ")
}
# PLOTS: boxplot of relative quantification------
if(plotTotalQuant){
if(verbose) message(">> PLOT OUT: TOTAL NUMBER OF PROTEINS/SITES QUANTIFIED")
numimputedfinal <- gsub(".txt", ".TotalQuantifications.pdf", log2fc_file)
numimputedfinal <- paste0("plot.", numimputedfinal)
numimputedfinal <- paste0(output_dir, "/", numimputedfinal)
if(printPDF) pdf(numimputedfinal)
.artms_plotNumberProteinsImputedLog2fc(imputedDF)
if(printPDF) garbage <- dev.off()
}
# CLUSTERING analysis of quantifications-----
if(plotClusteringAnalysis){
if (isPtm == "global") {
if(verbose) message(">> CLUSTERING ANALYSIS OF QUANTIFICATIONS ")
# PRE-CLUSTERING
# GET THE LIST OF SIGNIFICANTS FOR THE EXPERIMENT(S)
list_of_significants <- unique(imputedDF$Protein[which(abs(imputedDF$iLog2FC > 1) &
imputedDF$iPvalue < 0.05)])
# AND APPLY THE FILTER
data.select <- imputedDF[which(imputedDF$Protein %in% list_of_significants), ]
# GENE BASED -> heatmaps
##LEGACY
# hasdc <- data.table::dcast(data = data.select[which(data.select$imputed == "no"), ],
# Gene + Protein ~ Comparison,
# value.var = "iLog2FC",
# fun.aggregate = median,
# fill = 0)
# EXPERIMENT BASED -> PCA
# hasdcexp <- data.table::dcast(data = data.select[which(data.select$imputed == "no"), ],
# Comparison ~ Gene + Protein,
# value.var = "iLog2FC",
# fun.aggregate = median,
# fill = 0)
hasdc <- data.select %>% dplyr::filter(imputed == "no") %>%
tidyr::pivot_wider(id_cols = c(Gene, Protein),
names_from = Comparison,
values_from = iLog2FC,
values_fn = list(iLog2FC = median),
values_fill = list(iLog2FC = 0))
if(dim(hasdc)[1] > 50){
hasdcexp <- data.select %>% dplyr::filter(imputed == "no") %>%
dplyr::mutate(Gene_Protein = paste(Gene, Protein, sep = "_")) %>%
tidyr::pivot_wider(id_cols = Comparison,
names_from = Gene_Protein,
values_from = iLog2FC,
values_fn = list(iLog2FC = median),
values_fill = list(iLog2FC = 0))
hasdc <- as.data.frame(hasdc)
hasdcexp <- as.data.frame(hasdcexp)
# CLUSTERING ANALYSIS
# GENE BASED
rownames(hasdc) <- paste0(hasdc$Gene, "_", hasdc$Protein)
vamos <- within(hasdc, rm(Gene, Protein))
# if this dataset only have one comparison,
# this analysis does not makes sense: check it out:
if ( dim(vamos)[2] > 2 ) {
venga <- as.matrix(vamos)
# EXPERIMENT BASED
rownames(hasdcexp) <- hasdcexp$Comparison
vamosexp <- within(hasdcexp, rm(Comparison))
vengaexp <- as.matrix(vamosexp)
# PCA AND CORRELATION ANALYSIS
if(verbose) message("--- Correlation plots ")
df.cor.matrix <- round(cor(venga, use = "pairwise.complete.obs"), 2)
file_corr_l2fc <- gsub(".txt", ".log2fc-corr.pdf", log2fc_file)
file_corr_l2fc <- paste0(output_dir, "/", file_corr_l2fc)
if(printPDF) pdf(file_corr_l2fc, width = 12, height = 9)
corrplot::corrplot(
df.cor.matrix,
type = "upper",
tl.pos = "td",
method = "circle",
tl.cex = 0.9,
tl.col = 'black',
tl.srt = 45,
# order = "hclust",
diag = TRUE
)
PerformanceAnalytics::chart.Correlation(venga,
histogram = TRUE,
pch = 25,
main = "Correlation between Comparisons")
if(printPDF) garbage <- dev.off()
# BASED ON GROUPS
pca.hasdcexp <- FactoMineR::PCA(
hasdcexp[, -c(1)],
scale.unit = FALSE,
ncp = 4,
graph = FALSE
)
pca_all <- factoextra::fviz_pca_ind(pca.hasdcexp,
labelsize = 3,
repel = TRUE,
habillage = as.factor(hasdcexp$Comparison),
addEllipses = FALSE,
ellipse.level = 0.95
)
if(verbose) message("--- PCA, individuals plot ")
if(printPDF){
file_pca_l2fc <- gsub(".txt", ".log2fc-individuals-pca.pdf", log2fc_file)
file_pca_l2fc <- paste0(output_dir, "/", file_pca_l2fc)
pdf(file_pca_l2fc, width = 9, height = 7)
print(pca_all)
garbage <- dev.off()
} else{
print(pca_all)
}
# Determine the OPTIMAL NUMBER OF CLUSTERS:
# Elbow method
e1 <- factoextra::fviz_nbclust(venga, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(subtitle = "kmeans Elbow method")
e2 <- factoextra::fviz_nbclust(venga, cluster::pam, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(subtitle = "PAM Elbow method")
# Silhouette method
k1 <- factoextra::fviz_nbclust(venga, kmeans, method = "silhouette") +
labs(subtitle = "kmeans Silhouette method")
k2 <- factoextra::fviz_nbclust(venga, cluster::pam, method = "silhouette") +
labs(subtitle = "pam Silhouette method")
# Create a dendrogram
if(verbose) message("--- Dendrogram ")
res.dist <- factoextra::get_dist(vamosexp, stand = TRUE, method = "minkowski")
hc <- hclust(res.dist)
file_dendro_l2fc <- gsub(".txt", ".log2fc-dendro.pdf", log2fc_file)
file_dendro_l2fc <- paste0(output_dir, "/", file_dendro_l2fc)
if(printPDF){
pdf(file_dendro_l2fc, width = 9, height = 7)
plot(hc)
garbage <- dev.off()
}else{
plot(hc)
}
# COMPLEXHEATMAP Heatmap with a specified number of optimal clusters
n = 10
pam.res <- pam(vamos, k = n)
cp1 <- factoextra::fviz_cluster(pam.res)
cp2 <- factoextra::fviz_silhouette(pam.res, print.summary = FALSE)
if(verbose) message("--- Plots to determine optimal number of clusters ")
file_clusterplots_l2fc <- gsub(".txt", ".log2fc-clusters.pdf", log2fc_file)
file_clusterplots_l2fc <- paste0(output_dir, "/", file_clusterplots_l2fc)
if(printPDF){
pdf(file_clusterplots_l2fc, width = 9, height = 7)
print(e1)
print(e2)
print(k1)
print(k2)
print(cp1)
print(cp2)
garbage <- dev.off()
} else{
print(e1)
print(e2)
print(k1)
print(k2)
print(cp1)
print(cp2)
}
if(verbose) message("--- Cluster heatmaps (10 clusters) ")
hmap <- ComplexHeatmap::Heatmap(venga,
name = paste0("Clusters ", "(n = ", n, ")"),
col = circlize::colorRamp2(c(-3, 0, 3), c("firebrick1", "black", "olivedrab1")),
heatmap_legend_param = list(color_bar = "continuous",
legend_direction = "horizontal",
legend_width = unit(5, "cm"),
title_position = "topcenter",
title_gp = gpar(fontsize = 15, fontface = "bold")),
split = paste0("", pam.res$clustering),
row_title = "Genes",
row_title_side = "left",
row_title_gp = gpar(fontsize = 15, fontface = "bold"),
show_row_names = FALSE,
column_title = "Relative Quantifications",
column_title_side = "top",
column_title_gp = gpar(fontsize = 10, fontface = "bold"),
column_title_rot = 0,
show_column_names = TRUE,
cluster_columns = FALSE,
clustering_distance_columns = function(x) as.dist(1 - cor(t(x))),
clustering_method_columns = "ward.D2",
clustering_distance_rows = "euclidean",
clustering_method_rows = "ward.D2",
row_dend_width = unit(30, "mm"),
column_dend_height = unit(30, "mm"),
# top_annotation=colAnn,
# top_annotation_height = unit(1.75, "cm"),
# bottom_annotation=sampleBoxplot,
# bottom_annotation_height = unit(4, "cm"),
column_names_gp = gpar(fontsize = 10)
)
file_clusterheat_l2fc <- gsub(".txt", ".log2fc-clusterheatmap.pdf", log2fc_file)
file_clusterheat_l2fc <- paste0(output_dir, "/", file_clusterheat_l2fc)
if(printPDF) pdf(file_clusterheat_l2fc,
width = 12,
height = 10)
ComplexHeatmap::draw(hmap,
heatmap_legend_side = "top",
annotation_legend_side = "right")
if(printPDF) garbage <- dev.off()
# Pre enrichment of clusters
cl_number <- pam.res$clustering
dfclusters <- as.data.frame(cl_number)
dfclusters$ids <- row.names(dfclusters)
dfclusters$Gene <- gsub("(.*)(_)(.*)", "\\1", dfclusters$ids)
dfclusters$Protein <- gsub("(.*)(_)(.*)", "\\3", dfclusters$ids)
if(enrich){
if(verbose) message("--- Enrichment analysis of the clusters ")
# Making sure we have unique genes in each comparison
# (the PTM might bring redundancy)
pretmp <- dfclusters[c('Gene', 'cl_number')]
pretmp <- unique(pretmp)
tmp <- split(pretmp$Gene, pretmp$cl_number, drop = TRUE)
enrichgenes <- NULL
if (species == "human") {
tryCatch(enrichgenes <- artmsEnrichProfiler(tmp,
categorySource = c(
'GO:BP',
'GO:MF',
'GO:CC',
'KEGG',
'REAC',
'CORUM',
'HPA',
'OMIM'
),
species = 'hsapiens',
listOfGenes,
verbose = verbose),
error = function(e){
message("\n\t(Error): Enrichment is not possible! ")
message("\tgProfiler server night be down or your internet connection is not working")
enrich <- FALSE
}
)
} else if (species == "mouse") {
tryCatch(enrichgenes <- artmsEnrichProfiler(tmp,
categorySource = c('GO:BP',
'GO:MF',
'GO:CC',
'KEGG',
'REAC',
'CORUM'),
species = 'mmusculus',
listOfGenes,
verbose = verbose),
error = function(e){
message("\n\t(Error): Enrichment is not possible! ")
message("\tgProfiler server night be down or your internet connection is not working")
enrich <- FALSE
}
)
} else{
stop(species, " is currently not supported in the enrichment")
}
# Write the file
if(!is.null(enrichgenes)){
file_clusterheatenrich_l2fc <- gsub(".txt",
".log2fc-clusterheatmap-enriched.txt",
log2fc_file)
file_clusterheatenrich_l2fc <- paste0(output_dir, "/", file_clusterheatenrich_l2fc)
write.table(enrichgenes,
file_clusterheatenrich_l2fc,
col.names = TRUE,
row.names = FALSE,
sep = "\t",
quote = FALSE)
}
} #if enrich = TRUE
# Print out clusters
file_clusterheatdata_l2fc <- gsub(".txt", ".log2fc-clusterheatmap.txt", log2fc_file)
file_clusterheatdata_l2fc <- paste0(output_dir, "/", file_clusterheatdata_l2fc)
write.table(
dfclusters,
file_clusterheatdata_l2fc,
col.names = TRUE,
row.names = FALSE,
sep = "\t",
quote = FALSE
)
}
}else{
message("--- (-) Not enought data available to perform clustering analysis")
}
} # if (isPtm == "global") {
}
# End of clustering analysis
# END: WRITING OUTPUT FILES------
if(verbose) message(">> WRITING THE OUTPUT FILES")
# PRINT OUT IMPUTED
outlog2fcImpute <- gsub(".txt", "-log2fc-long.txt", log2fc_file)
outlog2fcImpute <- paste0(output_dir, "/", outlog2fcImpute)
write.table(
imputedDF,
outlog2fcImpute,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
outlog2fc <- gsub(".txt", "-log2fc-wide.txt", log2fc_file)
outlog2fc <- paste0(output_dir, "/", outlog2fc)
write.table(
log2fc_file_splc,
outlog2fc,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
outexcel <- gsub(".txt", "-summary.xlsx", log2fc_file)
outexcel <- paste0(output_dir, "/", outexcel)
if(is.null(mac.allsig) | is.null(mac.pos) | is.null(mac.neg))
enrich <- FALSE
if (enrich) {
# But now check whether is a PTM case:
if (grepl("ptm", isPtm)) {
list_of_datasets <- list(
"log2fcImputed" = imputedDF,
"log2fcImpExt" = imputedDFext,
"wide_iLog2fc" = imputedDF_wide_log2fc,
"wide_iPvalue" = imputedDF_wide_pvalue,
"enrichALL" = mac.allsig,
"enrichMACpos" = mac.pos,
"enrichMACneg" = mac.neg,
"enMACallCorum" = allsigComplexEnriched,
"enMACposCorum" = positiveComplexEnriched,
"enMACnegCorum" = negativesComplexEnriched
)
} else if (grepl("global", isPtm)) {
list_of_datasets <- list(
"log2fcImputed" = imputedDF,
"wide_iLog2fc" = imputedDF_wide_log2fc,
"wide_iPvalue" = imputedDF_wide_pvalue,
"enrichALL" = mac.allsig,
"enrich-MACpos" = mac.pos,
"enrich-MACneg" = mac.neg,
"enMACallCorum" = allsigComplexEnriched,
"enMACposCorum" = positiveComplexEnriched,
"enMACnegCorum" = negativesComplexEnriched
)
} else{
stop("Wrong option:", isPtm)
}
} else if (!enrich) {
if (grepl("ptm", isPtm)) {
list_of_datasets <- list(
"log2fcImputed" = imputedDF,
"log2fcImpExt" = imputedDFext,
"wide_iLog2fc" = imputedDF_wide_log2fc,
"wide_iPvalue" = imputedDF_wide_pvalue
)
} else if (grepl("global", isPtm)) {
list_of_datasets <- list(
"log2fcImputed" = imputedDF,
"wide_iLog2fc" = imputedDF_wide_log2fc,
"wide_iPvalue" = imputedDF_wide_pvalue
)
} else{
stop(isPtm, " is not a valid option")
}
} else{
stop("Contact developers")
# The script should have crashed by this point.
# If it gets up to here... it would be very weird
}
## EXCEL: Defining style for the header-----
hs <- openxlsx::createStyle(fontName = "Arial",
fontColour = "white",
fgFill = "#000000",
textDecoration = "Bold",
border = "Bottom")
openxlsx::write.xlsx(
list_of_datasets,
file = outexcel,
asTable = FALSE,
headerStyle = hs,
overwrite = TRUE
)
if(verbose) message("Folder <", output_dir, "> ")
if(verbose) message("- EXCEL: ", outexcel, " ")
if(verbose) message("- Log2fc Wide: ", outlog2fc, " ")
if(verbose) message("- Log2fc Impute: ", outlog2fc, " ")
if (enrich == TRUE) {
if(verbose) message("- ENRICHMENT files should also be out ")
}
if(verbose) message(">> SUPER ANALYSIS COMPLETED")
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Adding a column with the species name
#'
#' @description Adding the species name to every protein.
#' This makes more sense if there are more than one species in the dataset,
#' which must be specified in the `pathogen` option. Influenza is a special
#' case that it does not need to be specified, as far as the proteins were
#' originally annotated as `INFLUENZAGENE_STRAIN`
#' (strains covered `H1N1`, `H3N2`, `H5N1`), as for example, `NS1_H1N1`
#' @param df (data.frame) with a `Protein` column (of uniprot ids)
#' @param pathogen (char) Is there a pathogen in the dataset as well?
#' if it does not, then use `pathogen = nopathogen` (default). Supported`tb`
#' (Tuberculosis),
#' `lpn` (Legionella)
#' @param species (char) Host organism (supported for now: `human` or `mouse`)
#' @return (data.frame) The same data.frame but with an extra column
#' specifying the species
#' @keywords annotation, species
#' @examples
#' # Adding a new column with the main species of the data. Easy.
#' # But the main functionality is to add both the host-species and a pathogen,
#' # which is not illustrated in this example
#' data_with_specie <- artmsAnnotateSpecie(df = artms_data_ph_msstats_results,
#' species = "human")
#' @export
artmsAnnotateSpecie <- function(df,
pathogen = "nopathogen",
species) {
if(any(missing(df) | missing(species)))
stop("Missed (one or many) required argument(s).
Please, check the help of this function to find out more")
if (pathogen == "nopathogen") {
# Influenza is treated differently
df$Species <-
ifelse(grepl("_H1N1|_H3N2|_H5N1", df$Protein),
"Influenza",
species)
} else if (pathogen == "tb") {
pathogen.ids <- artms_data_pathogen_TB
df$Species <-
ifelse(df$Protein %in% pathogen.ids$Entry, pathogen, species)
} else if (pathogen == "lpn") {
pathogen.ids <- artms_data_pathogen_LPN
df$Species <-
ifelse(df$Protein %in% pathogen.ids$Entry, pathogen, species)
}
return(df)
}
#
#' @title Generate ph-site specific detailed file
#'
#' @description Generate extended detailed ph-site file, where every line is a
#' ph site instead of a peptide. Therefore, if one peptide has multiple ph sites
#' it will be breaking down in each of the sites. This file will help generate
#' input files for tools as [Phosfate](http://phosfate.com/) or
#' [PHOTON](https://github.com/jdrudolph/photon)
#' @param df (data.frame) of log2fc and imputed values
#' @param pathogen (char) Is there a pathogen in the dataset as well? Available
#' pathogens are `tb` (Tuberculosis), `lpn` (Legionella). If it is not,
#' then use `nopathogen` (default).
#' @param species (char) Main organism (supported for now: `human` or `mouse`)
#' @param ptmType (char) It must be a ptm-site quantification dataset. Either:
#' yes: `ptmsites` (for site specific analysis), or
#' `ptmph` (Jeff's script output evidence file).
#' @param output_name (char) A output file name (extension `.txt` required)
#' @return (data.frame) extended version of the ph-site
#' @keywords external, tools, phosfate
#' @examples \dontrun{
#' artmsGeneratePhSiteExtended(df = dfobject,
#' species = "mouse",
#' ptmType = "ptmsites",
#' output_name = log2fc_file)
#' }
#' @export
artmsGeneratePhSiteExtended <- function(df,
pathogen = "nopathogen",
species,
ptmType,
output_name) {
# #Debug
# df = artms_data_ph_msstats_results
# pathogen = "nopathogen"
# species = "human"
# ptmType = "ptmsites"
# output_name = ""
if(any(missing(df) |
missing(species) |
missing(ptmType) |
missing(output_name)))
stop("Missed (one or many) required argument(s)
Please, check the help of this function to find out more")
imputedDFext <- NULL
imputedDFext <- .artms_checkIfFile(input_file = df)
if (ptmType == "ptmph") {
# Change the Protein name to Uniprot_PTM (if it is not there already)
if(!any(grepl("Uniprot_PTM", colnames(imputedDFext))))
names(imputedDFext)[grep('^Protein$', names(imputedDFext))] <- 'Uniprot_PTM'
# Take the Protein ID, but being very careful about the fluomics labeling
imputedDFext <-
.artmsExtractUniprotId(x = imputedDFext,
uniprotPtmColumn = "Uniprot_PTM",
newColumnName = "Protein")
# Extract sites from Uniprot_PTM
imputedDFext$PTMsite <-
gsub("^(\\S+?)(_ph.*)", "\\2", imputedDFext$Uniprot_PTM, perl = TRUE)
imputedDFext$PTMsite <- gsub("^(_ph)", "", imputedDFext$PTMsite)
imputedDFext$PTMsite <- gsub("_ph", ",", imputedDFext$PTMsite)
# And create independent columns for each of them
imputedDFext <- imputedDFext %>% mutate(PTMsite = strsplit(PTMsite, ",")) %>% tidyr::unnest(PTMsite)
if( !any(grepl("Gene", colnames(imputedDFext))) ){
suppressMessages(
imputedDFext <- artmsAnnotationUniprot(imputedDFext,
'Protein',
species)
)
}
if( any(grepl("Label", colnames(imputedDFext))) ){
names(imputedDFext)[grep("^Label$", names(imputedDFext))] <- 'Comparison'
}
imputedDFext <- artmsAnnotateSpecie(imputedDFext, pathogen, species)
} else if (ptmType == "ptmsites") {
#1. Change the Protein name to Uniprot_PTM (if it is not there already)
if(!any(grepl("Uniprot_PTM", colnames(imputedDFext))))
names(imputedDFext)[grep('^Protein$', names(imputedDFext))] <-
'Uniprot_PTM'
# 2. Make a copy of Uniprot_PTM to operate on it
imputedDFext$PTMone <- imputedDFext$Uniprot_PTM
# 3. Create independent columns for each of them
imputedDFext <- imputedDFext %>%
dplyr::mutate(PTMone = strsplit(PTMone, ",")) %>%
tidyr::unnest(PTMone)
# 4. And take the labels:
imputedDFext$Protein <-
ifelse(
grepl("_H1N1|_H3N2|_H5N1", imputedDFext$PTMone),
gsub("^(\\S+?_H[1,3,5]N[1,2])_.*", "\\1",
imputedDFext$PTMone, perl = TRUE
) ,
gsub("^(\\S+?)_.*", "\\1", imputedDFext$PTMone, perl = TRUE)
)
imputedDFext$PTMaa <-
gsub("(\\S+)(_)([S,T,Y,K])(\\d+)", "\\3", imputedDFext$PTMone)
imputedDFext$PTMsite <-
gsub("(\\S+)(_)([S,T,Y,K])(\\d+)", "\\4", imputedDFext$PTMone)
imputedDFext$PTMone <- NULL
if( !any(grepl("Gene", colnames(imputedDFext))) ) {
suppressMessages(
imputedDFext <- artmsAnnotationUniprot(imputedDFext, 'Protein', species)
)
}
if( any(grepl("Label", colnames(imputedDFext))) ) {
names(imputedDFext)[grep("^Label$", names(imputedDFext))] <- 'Comparison'
}
# imputedDFext$Species <- ifelse(grepl("_H1N1|_H3N2|_H5N1",
# imputedDFext$Protein), "Influenza", species)
# imputedDFext$Species <- ifelse(imputedDFext$Protein
# %in% pathogen.ids$Entry, pathogen, species)
if( !any(grepl("Species", colnames(imputedDFext))) ){
suppressMessages(
imputedDFext <- artmsAnnotateSpecie(imputedDFext, "Protein", species)
)
}
}else{
stop( "--- (!) Only 'ptmph' or 'ptmsites' allowed for argument <ptmType> ")
}
outlog2fcImputext <- gsub(".txt", "-imputedL2fcExtended.txt", output_name)
write.table(
imputedDFext,
outlog2fcImputext,
quote = FALSE,
sep = "\t",
row.names = FALSE,
col.names = TRUE
)
return(imputedDFext)
}
#
# @title Imputing missing values
#
# @description When a value is completely missed in one of the conditions,
# the `log2fc = Inf / -Inf`. This function imputes those values, i.e.,
# will assign 'artificial' values.
# The imputation method works as follow. The assumption is that those
# proteins are likely present as well in those conditions where are found
# missed, but due to the small sampling (usually 2 or 3 biological replicas)
# and other proteomics related issues, those proteins didn't make it through
# the level of detection.
# Therefore, a small intensity (sampled from the bottom 10 intensity values)
# will be assigned to the protein/site in the missing condition,
# and the new log2fc is re-calculated out of the MSstats box.
# Two issues are addressed as follows:
# 1. If a protein has been consistently identified in one of the conditions,
# it will stay
# 2. But if the intensity value in those conditions was too low,
# then the log2fc will be also low
# @param dflog2fcinfinites data.frame of proteins with only infinite
# values from the msstats results file
# @param dfmq Abundance data, which will be used to know the details of
# reproducibility
# @return Imputed missing values
# @keywords internal, imputation, log2fc, quantifications, missing values
.artms_imputeMissingValues <- function(dflog2fcinfinites,
dfmq) {
# The comparsions
contrast <- unique(dflog2fcinfinites$Label)
# Select the IDs to impute
ids2impute <- unique(dflog2fcinfinites$Protein)
# Take the abundance values for all the proteins
abu2imp <- .artms_loadModelqcBasic(dfmq)
# Aggregate the technical replica by choosing the maximum value
abu2imp <- aggregate(Abundance ~ Protein + Condition + BioReplicate,
data = abu2imp,
FUN = mean)
# Check things that will be imputed
# dfdc.ni <- data.table::dcast(data=abu2imp2,
# Protein~BioReplicate, value.var = "Abundance")
# Two possible options here.
# 1. Select based on the bottom x%
# # Imputing the missing values by selecting randomly from the bottom 5%
# theMin <- min(dfmq$ABUNDANCE)
# # Select the 5% quartile as the maximum value to sample from
# theMax <- quantile(dfmq$ABUNDANCE, probs = .05)
#
# 2. Select the bottom 20 intensities
# Grab the bottom 30 intensities in the dataset
dfmqOrdered <- dfmq[order(dfmq$ABUNDANCE, decreasing = FALSE), ]
numberFromBottom <- 5
abuBottom <- head(dfmqOrdered$ABUNDANCE, n = numberFromBottom)
theMin <- abuBottom[1]
theMax <- abuBottom[numberFromBottom]
# Generating the numbers from which we are going to sample
numbers2sample <- seq(from = theMin, to = theMax, by = .00001)
##LEGACY
# # dcast on abundance and fill with random numbers between the minimum and q05
# suppressWarnings(
# dfdc.im <- data.table::dcast(data = abu2imp2,
# Protein ~ BioReplicate,
# value.var = "Abundance",
# fill = sample(numbers2sample,
# replace = FALSE)
# )
# )
dfdc.im <- abu2imp %>%
dplyr::select(-Condition) %>%
tidyr::pivot_wider(names_from = BioReplicate,
values_from = Abundance)
dfdc.im <- as.data.frame(dfdc.im)
dfdc.im[] <- Map(function(x, y) replace(x, is.na(x), y),
dfdc.im,
sample(numbers2sample,
size = dim(dfdc.im)[2],
replace = FALSE))
# Needs to aggregate on biological replicas
# 1. Melt on biological replicas
# Legacy
# dfdc.melt <- data.table::melt(dfdc.im,
# id.vars = c('Protein'),
# value.name = 'Abundance',
# variable.name = 'BioReplicate')
dfdc.melt <- dfdc.im %>%
tidyr::pivot_longer(cols = -c(Protein),
names_to = "BioReplicate",
values_to = "Abundance")
# 2. Get the condition
dfdc.melt$Condition <- gsub("(.*)(-)(.*)", "\\1", dfdc.melt$BioReplicate)
# 3. Dcast and Aggregate on the condition, taking the mean
##LEGACY
# dfdc.final <-
# data.table::dcast(
# data = dfdc.melt,
# Protein ~ Condition,
# value.var = 'Abundance',
# fun.aggregate = mean
# )
dfdc.final <- dfdc.melt %>%
dplyr::select(-BioReplicate) %>%
tidyr::pivot_wider(names_from = Condition,
values_from = Abundance,
values_fn = list(Abundance = mean))
# 4. Filter by proteins to impute
dfdc.final <- dfdc.final[which(dfdc.final$Protein %in% ids2impute), ]
for (c in contrast) {
# message("\t",c," --> ")
x <- gsub("(.*)(-)(.*)", "\\1", c)
y <- gsub("(.*)(-)(.*)", "\\3", c)
# message("log2fc(",x, " - ", y,") ")
# Renaming the comparision name just for illustration purposes
rnc <- paste0("l2fc_", c)
dfdc.final[[rnc]] <- dfdc.final[[x]] - dfdc.final[[y]]
}
# Select only log2fc columns
imputedL2FCValues <- dfdc.final[grepl("Protein|l2fc_", colnames(dfdc.final))]
# Melt again
# LEGACY
# imputedL2FCmelted <- data.table::melt(imputedL2FCValues,
# id.vars = c('Protein'),
# variable.name = 'Label',
# value.name = 'iLog2FC')
imputedL2FCmelted <- imputedL2FCValues %>%
tidyr::pivot_longer(cols = -Protein,
names_to = "Label",
values_to = "iLog2FC")
# Now let's get it ready for merging with the values to be
# imputed at dflog2fcinfinites
imputedL2FCmelted$Label <- gsub("l2fc_", "", imputedL2FCmelted$Label)
# And let's add p-values
samplingPvalue <- seq(from = 0.01, to = 0.05, by = .0000001)
# And add imputed pvalues
imputedL2FCmelted$iPvalue <- sample(samplingPvalue,
size = nrow(imputedL2FCmelted),
replace = FALSE)
imputedL2FCmelted <- as.data.frame(imputedL2FCmelted)
return (imputedL2FCmelted)
}
#
# @title Load limited columns from abundance (modelqc) annotated
#
# @description Load limited columns from abundance (modelqc) annotated
# @param df (data.frame) with the raw abundance data (modelqc)
# @param species (char) Species name for annotation purposes
# @param ptmis (char) Specify whether is a PTM dataset: `global`, `ptmsites`,
# `ptmph`
# @param verbose (logical) `TRUE` (default) shows function messages
# @return annotated data.frame of abundance data
# @keywords abundance, annotated
.artms_loadModelQCstrict <- function (df,
species,
ptmis,
verbose = TRUE) {
colnames(df) <- toupper(colnames(df))
# Remove empty entries
if (any(df$PROTEIN == "")) {
df <- df[-which(df$PROTEIN == ""), ]
}
df$PROTEIN <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df$PROTEIN)
df$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", df$PROTEIN)
# Technical replicas: aggregate on the mean the technical replicas
b <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
data = df,
FUN = mean)
##LEGACY
# datadc <- data.table::dcast(data = b,
# PROTEIN ~ GROUP,
# value.var = 'ABUNDANCE',
# fun.aggregate = mean)
datadc <- b %>% pivot_wider(id_cols = PROTEIN,
names_from = GROUP,
values_from = ABUNDANCE,
values_fn = list(ABUNDANCE = mean))
names(datadc)[grep('PROTEIN', names(datadc))] <- 'Protein'
if (grepl("ptm", ptmis)) {
# if is a PTM dataset we don't need the real gene names for now,
# we need to use the Uniprot_ptm notation
datadc$Gene <- datadc$Protein
send_back <- datadc
} else{
suppressMessages(
send_back <- artmsAnnotationUniprot(x = datadc,
columnid = 'Protein',
species = species)
)
}
return(send_back)
}
#
# @title Load the basic ModelQC file
#
# @param x (data.frame) of the ModelQC file
# @return (data.frame) of the modelqc file with the columns Protein, Abundance,
# Condition, BioReplicate
# @keywords internal, loading
.artms_loadModelqcBasic <- function(x) {
## Not removing protein groups for now
# if (length(grep(";", x$PROTEIN)) > 0)
# x <- x[-grep(";", x$PROTEIN), ]
if ("PROTEIN" %in% colnames(x)) {
names(x)[grep("PROTEIN", names(x))] <- 'Protein'
} else{
stop("<PROTEIN> column not found")
}
if ("ABUNDANCE" %in% colnames(x)) {
names(x)[grep("ABUNDANCE", names(x))] <- 'Abundance'
} else{
stop("<ABUNDANCE> protein not found!")
}
if ("GROUP" %in% colnames(x)) {
names(x)[grep("GROUP", names(x))] <- 'Condition'
} else{
stop("<GROUP> not found")
}
if ("SUBJECT" %in% colnames(x)) {
names(x)[grep("SUBJECT", names(x))] <- 'BioReplicate'
} else{
stop("<SUBJECT> not found")
}
x <- subset(x, select = c(Protein, Abundance, Condition, BioReplicate))
return(x)
}
#
# @title Merge abundance and number of biological replicates per condition
#
# @description Merge abundance and number of biological replicates
# per condition
# @param df_input (data.frame) Abundance input file
# @param repro (data.frame) Reproducibility data.frame
# @param species (char) Species for annotation purposes
# @return (data.frame) of abundance merged with reproducibility info
# @keywords abundance, reproducibility, merging
.artms_mergeAbNbr <- function (df_input, repro, species) {
# Remove empty entries
if (any(df_input$PROTEIN == "")) {
df_input <- df_input[-which(df_input$PROTEIN == ""), ]
}
df_input$PROTEIN <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df_input$PROTEIN)
df_input$PROTEIN <- gsub("(.*)(\\|.*)", "\\1", df_input$PROTEIN)
# TECHNICAL REPLICAS: if there are technical replicas,
# this means that we will find
# two values for the same protein in the same bioreplica,
# therefore we need to aggregate first just in case:
df_input <- aggregate(ABUNDANCE ~ PROTEIN + GROUP + SUBJECT,
data = df_input,
FUN = mean)
##LEGACY
# dc_input <- data.table::dcast( data = df_input[, c('PROTEIN', 'ABUNDANCE', 'GROUP')],
# PROTEIN ~ GROUP,
# value.var = 'ABUNDANCE',
# fun.aggregate = mean,
# fill = 0)
dc_input <- df_input %>% pivot_wider(id_cols = PROTEIN,
names_from = GROUP,
values_from = ABUNDANCE,
values_fn = list(ABUNDANCE = mean),
values_fill = list(ABUNDANCE = 0))
names(dc_input)[grep('PROTEIN', names(dc_input))] <- 'Protein'
colnames(repro) <- paste("NumBR", colnames(repro), sep = "_")
colnames(repro)[1] <- 'Protein'
dc_input <- merge(dc_input, repro, by = c('Protein'))
return(dc_input)
}
#
# @title Merge changes (log2fc) and number of biological replicates per
# condition
#
# @description Merge changes, i.e., MSstats results file of quantified changes,
# with the number of biological replicates per condition
# @param df_input Changes data.frame
# @param repro Reproducibility data.frame
# @param species Species for annotation purposes
# @param choosePvalue pvalue to use when making wider the dataframe
# @return Merged data.frame of changes and reproducibility information
# @keywords changes, log2fc, reproducibility, merging
.artms_mergeChangesNbr <- function (df_input, repro, species, choosePvalue) {
# # Remove the weird empty proteins
# if(any(df_input$Protein == "")){ df_input <-
# df_input[-which(df_input$Protein == ""),]}
# df_input$Protein <- gsub("(^sp\\|)(.*)(\\|.*)", "\\2", df_input$Protein )
# df_input$Protein <- gsub("(.*)(\\|.*)", "\\1", df_input$Protein )
##LEGACY
# input_melt = data.table::melt(data = df_input[, c('Protein',
# 'Label',
# 'log2FC',
# 'adj.pvalue'), ],
# id.vars = c('Protein', 'Label'))
input_melt <- df_input %>%
dplyr::select(c(Protein,
Label,
log2FC,
!!!choosePvalue)) %>%
tidyr::pivot_longer(cols = -c(Protein, Label),
names_to = "variable",
values_to = "value")
##LEGACY
# input_dcast <- data.table::dcast(Protein ~ Label + variable,
# data = input_melt,
# value.var = c('value'))
input_dcast <- input_melt %>%
dplyr::mutate(Label_variable = paste(Label, variable, sep = "_")) %>%
tidyr::pivot_wider(id_cols = c(Protein),
names_from = Label_variable,
values_from = value)
colnames(repro) <- paste("NumBR", colnames(repro), sep = "_")
colnames(repro)[1] <- 'Protein'
input_dcast <- merge(input_dcast, repro, by = c('Protein'))
# Move Gene name to the left:
return(input_dcast)
}
# @title Filter: Remove proteins below some threshold of minimal reproducibility
#
# @description If a protein is not found in a minimal number of
# biological replicates in at least one of the conditions, it is removed
# @param dfi (data.frame) Data.frame with biological replicates information
# @param mnbr (int) minimal number of biological replicates
# @return (data.frame) a filtered `dfi`
# @keywords internal, filter, bioreplicates, reproducibility
.artms_RemoveProtBelowThres <- function(dfi, mnbr) {
theComparisons2check <- unique(dfi$Label)
for (onlyonecomp in (theComparisons2check)) {
ax <- gsub("(.*)(-)(.*)", "\\1", onlyonecomp)
ay <- gsub("(.*)(-)(.*)", "\\3", onlyonecomp)
# If the condition is not met, i.e., if all the proteins are
# found in at least X biological replicas, then
# it would remove the whole thing.
if (dim(dfi[which((dfi[[ax]] < mnbr) & (dfi[[ay]] < mnbr)), ])[1] > 0) {
dfi <- dfi[-which(((dfi[[ax]] < mnbr) & (dfi[[ay]] < mnbr)) & (dfi$Label == onlyonecomp)), ]
}
}
return(dfi)
}
#
# @title Select and label the condition more abundant in a quantification
#
# @description Select and label the condition more abundant in a quantification
# - If log2fc > 0 the condition on the left ('numerator') is the most abundant
# - If log2fc < 0 the condition on the right ('denominator')
# is the most abundant
# @param a (char) log2fc column
# @param b (char) comparison column
# @return (char) One of the conditions from the comparison
# @keywords internal, selection, labeling
.artms_selectTheOneLog2fc <- function(a, b) {
thrs <- 0
# a should be the log2fc column
if (a > thrs) {
sb <- gsub("(.*)(-)(.*)", "\\1", b)
} else if (a < -thrs) {
sb <- gsub("(.*)(-)(.*)", "\\3", b)
} else {
sb <- 'NA'
}
return(sb)
}
#
# @title Extract Uniprot ID from a UNIPROT_PTM identifier
#
# @param x The data.frame with the uniprot_ptm column
# @param uniprotPtmColumn (char) Column in the data.frame with the Uniprot_PTM
# IDs
# @param newColumnName (char) Name of the new column that will contain the
# Uniprot ids
# @return (char) One of the conditions from the comparison
# @keywords internal, selection, labeling
.artmsExtractUniprotId <- function(x, uniprotPtmColumn, newColumnName){
if(any(missing(x) | missing(newColumnName) | missing(uniprotPtmColumn)))
stop("Missed (one or many) required argument(s)
Please, check the help of this function to find out more")
if(!is.character(newColumnName))
stop("Argument 'newColumnName' must be a character")
if(!is.character(uniprotPtmColumn))
stop("Argument 'uniprotPtmColumn' must be a character")
x[[newColumnName]] <- ifelse(
grepl("_H1N1|_H3N2|_H5N1",
x[[uniprotPtmColumn]]),
gsub("^(\\S+?_H[1,3,5]N[1,2])_.*",
"\\1",
x[[uniprotPtmColumn]],
perl = TRUE) ,
gsub("^(\\S+?)_.*",
"\\1",
x[[uniprotPtmColumn]],
perl = TRUE)
)
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.