PhosMap is a comprehensive R package for analyzing quantitative phosphoproteomics data, which provides mutltiple functions for users as follow: - (1) clustering: principal component analysis (PCA) and t-Distributed Stochastic Neighbor Embedding (t-SNE); - (2) differential expression analysis; - (3) time course analysis; - (4) kinase activity prediction to find activated/deactivated kinases from the kinase-substrate database - (5) phosphorylation motif enrichment analysis to provide clues for finding candidate kinases that are not present in the database; - (6) and data visualization.
To test the efficacy of PhosMap and help users get started quickly, we collected a dataset including 39 phosphoproteomics and 32 proteomics raw files deposited in the ProteomeXchange Consortium (Ressa, et al., 2018).The partial key intermediate results were provided for uers to master PhosMap. We have embeded intermediate results from demo into PhosMap for help users get started.
# load PhosMap library('PhosMap') # load intermediate results from https://github.com/ecnuzdd/PhosMap_datasets # manully download from https://github.com/ecnuzdd/PhosMap_datasets/blob/master/data/BRAFi.RData load("BRAFi.RData") # background_df A data frame for motif enrichment analysis as background # combined_df_with_mapped_gene_symbol Get a data frame mapped GI number to Gene Symbol # data_frame_normalization_with_control_no_pair A data frame containning phosphoproteomics data normalized by proteomics data. # foreground_df A data frame for motif enrichment analysis as foreground. # fuzzy_input_df A data frame for time course analysis as input. # group A factor for experiment group information. # merge_df_with_phospho_peptides A merged phosphoproteomics data frame based on peptides files (unique ID). # motif_group_m_ratio_df_mat A matrix for motif profile. # phospho_data_filtering_STY_and_normalization A phosphoproteomics data frame after normalization and filtering. # profiling_data_normalized A proteomics data frame after normalization and filtering. # summary_df_of_unique_proteins_with_sites A data frame that phosphorylation sites had been mapping to protein sequence and eliminated redundancy.
An intact data pre-processing procedure of phosphoproteomics data covered three main steps: merging input files after quality control, mapping phosphorylation sites (p-sites) to the corresponding protein sequence and data normalization.
'Phosphoproteomics data' and 'The phosphoproteomics experimental design' are required as input. For p-sites detected by Mascot, PhosMap could provide confidence probability of p-sites extracted from Mascot xml file. For p-sites detected by other software, a two column table including their corresponding sequences and confidence probability was indispensable. Then quality control at peptide and site levels for each experiment was performed.
BASE_DIR <- getwd() # working directory BASE_DIR <- normalizePath(BASE_DIR) phosphorylation_exp_design_info_file_path <- normalizePath(file.path(BASE_DIR, 'phosphorylation_exp_design_info.txt')) phosphorylation_peptide_dir <- normalizePath(file.path(BASE_DIR, 'phosphorylation_peptide_txt')) if(FALSE){ # if you have xml files from mascot results, you can run the cmd to parser them to text files. mascot_xml_dir <- normalizePath(file.path(BASE_DIR, 'mascot_xml')) mascot_txt_dir <- normalizePath(file.path(BASE_DIR, 'mascot_txt')) extract_psites_score(phosphorylation_exp_design_info_file_path, mascot_xml_dir, mascot_txt_dir) # Based on above-mentioned text files from Mascot results, # the following cmd can generate CSV files of phosphorylation sites with confidence score. psites_score_dir <- normalizePath(file.path(BASE_DIR, 'psites_score_txt')) generate_psites_score_file(mascot_txt_dir, phosphorylation_peptide_dir, psites_score_dir) } # Merge phosphoproteomics data based on peptides files (unique ID). # If qc = TRUE, considering confidence score of phosphorylation sites. # A merged phosphoproteomics data frame based on peptides files (unique ID). merge_df_with_phospho_peptides <- pre_process_filter_psites( phosphorylation_peptide_dir, psites_score_dir, phosphorylation_exp_design_info_file_path, qc = TRUE, min_score = 20, min_FDR = 0.01 )
# Get a data frame mapped GI number to Gene Symbol. # system.time({ # combinated_df_with_mapped_gene_symbol = get_combined_data_frame( # merge_df_with_phospho_peptides, species = 'human', id_type = 'RefSeq_Protein_GI' # ) # }) head(combined_df_with_mapped_gene_symbol)
# Assign psites to protein sequence. # Unique ID: protein_gi + phosphorylation site in protein sequence. # system.time({ # summary_df_of_unique_proteins_with_sites = get_summary_with_unique_sites( # combinated_df_with_mapped_gene_symbol, species = 'human', fasta_type = 'refseq' # ) # }) head(summary_df_of_unique_proteins_with_sites)
PhosMap provides two kinds of normalizations. 1. PhosMap allowed for a total sum scaling normalization.
# Imputation with the next order of magnitude of the minimum except for zero. # Filtering data only including phosphorylation site. phospho_data_filtering_STY_and_normalization_list <- get_normalized_data_of_psites( summary_df_of_unique_proteins_with_sites, phosphorylation_exp_design_info_file_path, topN = NA, mod_types = c('S', 'T', 'Y') ) phospho_data_filtering_STY <- phospho_data_filtering_STY_and_normalization_list$ptypes_area_df_with_id phospho_data_filtering_STY_and_normalization <- phospho_data_filtering_STY_and_normalization_list$ptypes_fot5_df_with_id head(phospho_data_filtering_STY_and_normalization)
# Based on phospho_data_filtering_STY_and_normalization ID <- paste(phospho_data_filtering_STY_and_normalization$GeneSymbol, phospho_data_filtering_STY_and_normalization$AA_in_protein, sep = '_') Value <- phospho_data_filtering_STY_and_normalization[,-seq(1,6)] phospho_data <- data.frame(ID, Value) phospho_data_rownames <- paste(phospho_data_filtering_STY_and_normalization$GI, phospho_data_filtering_STY_and_normalization$GeneSymbol, phospho_data_filtering_STY_and_normalization$AA_in_protein, sep = '_') rownames(phospho_data) <- phospho_data_rownames # Further normalize phosphoproteomics data based on proteomics data # The configurations of function see help document. data_frame_normalization_with_control_no_pair <- normalize_phos_data_to_profiling( phospho_data, profiling_data_normalized, phosphorylation_exp_design_info_file_path, profiling_exp_design_info_file_path, control_label = '0', pair_flag = FALSE ) head(data_frame_normalization_with_control_no_pair)
PhosMap incorporated four analysis modules, including clustering and differential expression analysis, time course analysis, kinase-substrate enrichment analysis to find activated/deactivated kinases and motif enrichment analysis.
# Clustering: t-SNE or PCA expr_data_frame <- data_frame_normalization_with_control_no_pair # t-SNE using all experiments visualization_with_simple_tsne(expr_data_frame, group)
- PCA
expr_ID <- as.vector(expr_data_frame[,1]) expr_Valule <- expr_data_frame[,-1] expr_Valule_mean <- NULL expr_Valule_row <- nrow(expr_Valule) for(i in 1:expr_Valule_row){ x <- as.vector(unlist(expr_Valule[i,])) x_m <- tapply(x, group, mean) expr_Valule_mean <- rbind(expr_Valule_mean, x_m) } group_levels = levels(group) colnames(expr_Valule_mean) <- group_levels expr_df <- data.frame(expr_ID, expr_Valule_mean) # PCA using mean value in group for comparison with original literature visualization_with_simple_pca(expr_df)
# Differently expressed Proteins/Genes analysis # t2 vs t0 expr_data_frame <- data_frame_normalization_with_control_no_pair[,1:17] # phosphoproteomics data normalized by proteomics data # select phosphorylation sites with greater variation expr_data_frame_var <- apply(expr_data_frame, 1, function(x){ var(x[-1]) }) index_of_kept <- which(expr_data_frame_var>1) expr_data_frame <- expr_data_frame[index_of_kept,] # group information (t0 vs t2) deps_group_levels <- c('t0', 't2') deps_group <- factor(as.vector(group)[1:16], levels = deps_group_levels)
- limma
# (1) limma limma_results_df <- analysis_deps_limma(expr_data_frame, deps_group, deps_group_levels, log2_label = FALSE, adjust_method = 'none') limma_results_df$ID <- apply(limma_results_df, 1, function(x){ x = strsplit(x, '_')[[1]] paste(x[2], x[3], sep = '_') }) visualization_deps_with_scatter(limma_results_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with limma', show_text = FALSE, min_up_text = 70, min_down_text = 70)
- SAM
# (2) SAM sam_results_list <- analysis_deps_sam(expr_data_frame, deps_group, log2_label = FALSE, nperms = 100, rand = NULL, minFDR = 0.05, samr_plot = TRUE) sam_results <- rbind(sam_results_list$genes_up_df, sam_results_list$genes_down_df)
- ANOVA
# (3) annova anova_result_df <- analysis_deps_anova(expr_data_frame, deps_group, log2_label = FALSE, return_padjust = TRUE, adjust_method = 'BH') visualization_deps_with_scatter(anova_result_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with anova', show_text = FALSE, min_up_text = 15, min_down_text = 15)
Fuzzy clustering was applied to time course analysis for discovering patterns associated with time points in PhosMap.The corresponding line chart combined with membership for each cluster was also drawn.
group_levels <- levels(group) # fuzzy c-means clustering set.seed(1000) fuzzy_clustObj <- visualization_fuzzycluster( fuzzy_input_df, group, group_levels, k_cluster=9, iteration = 100, mfrow = c(3,3), min_mem = 0.1, plot = TRUE ) # clusters information clusterS_info <- fuzzy_clustObj$cluster clusterS_names <- names(clusterS_info) clusters_df <- data.frame(clusterS_names, clusterS_info) # write.csv(clusters_df, 'clusters_df.csv', row.names = TRUE)
In PhosMap, three kinase activity prediction methods were included: KSEA, multiple linear regression (MLR) and Mean Value. - Data preparation
# For early and late response # early -> clusterS_info==1 # late -> clusterS_info==2 cluster_flag <- 'early' cluster_symbol <- clusterS_names[clusterS_info==1] expr_data_frame <- data_frame_normalization_with_control_no_pair index_of_cluster <- match(cluster_symbol, expr_data_frame$ID) cluster_df <- expr_data_frame[index_of_cluster,]
# Perform KSEA summary_df_list_from_ksea_cluster <- get_summary_from_ksea(cluster_df, species = 'human', log2_label = FALSE, ratio_cutoff = 3) # Activity of regulons for regulation ksea_regulons_activity_df_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df ksea_id_cluster <- as.vector(ksea_regulons_activity_df_cluster[,1]) ksea_value_cluster <- ksea_regulons_activity_df_cluster[,-1] if(FALSE){ # Pvalue of regulons for regulation ksea_regulons_pvalue_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_pvalue_df # Activity of regulons for regulation ksea_regulons_activity_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df # Expression ratio of regulons for regulation ksea_kinase_site_substrate_original_ratio_cluster <- summary_df_list_from_ksea_cluster$ksea_kinase_site_substrate_original_ratio_df } # plot pheatmap if(TRUE){ # annotation setting annotation_col <- data.frame( group = group ) rownames(annotation_col) <- colnames(ksea_value_cluster) # breaks and colors setting breaks_1 <- seq(-4, -2, 0.2) colors_1 <- colorRampPalette(c('#11264f', '#145b7d'))(length(breaks_1)-1) breaks_2 <- seq(-2, -1, 0.2) colors_2 <- colorRampPalette(c('#145b7d', '#009ad6'))(length(breaks_2)) breaks_3 <- seq(-1, 1, 0.2) colors_3 <- colorRampPalette(c('#009ad6', 'white', '#FF6600'))(length(breaks_3)) breaks_4 <- seq(1, 2, 0.2) colors_4 <- colorRampPalette(c('#FF6600', 'red'))(length(breaks_4)) breaks_5 <- seq(2, 4, 0.2) colors_5 <- colorRampPalette(c('red', 'firebrick'))(length(breaks_5)) breaks <- c(breaks_1, breaks_2, breaks_3, breaks_4, breaks_5) breaks <- breaks[which(!duplicated(breaks))] color <- c(colors_1, colors_2, colors_3, colors_4, colors_5) color <- color[which(!duplicated(color))] library(pheatmap) ph <- pheatmap( ksea_value_cluster, scale = 'none', annotation_col = annotation_col, clustering_distance_rows = 'euclidean', fontsize_row = 5, # cutree_rows = 1, show_rownames = TRUE, fontsize_col = 5, # cutree_cols = 1, cluster_cols = FALSE, border_color = 'black', cellwidth = 5, cellheight = 5, breaks = breaks, color = color, legend_breaks = c(-4, -2, -1, 0, 1, 2, 4), legend_labels = c(-4, -2, -1, 0, 1, 2, 4), main = paste('Kinase-substrate enrichment analysis', cluster_flag, sep = ' ') ) }
# get kinase activity matrix with multiple linear regression (mlr) method kinase_activity_df_mlr <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mlr')
# get kinase activity matrix with mean value method kinase_activity_df_mean <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mean')
PhosMap allowed for performing MEA on user defined phosphopeptides lists. - Data preparation
# *** foreground *** foreground_data <- phospho_data_filtering_STY_and_normalization # pre-processed data foreground_sequence <- as.vector(foreground_data$Sequence) GI <- as.vector(foreground_data$GI) Sequence <- as.vector(foreground_data$Sequence) AA_in_protein <- as.vector(foreground_data$AA_in_protein) # *** required parameters *** fixed_length <- 15 species <- 'human' motifx_pvalue <- 0.01 # get foreground data frame # foreground_df = get_aligned_seq_for_mea(ID, Sequence, AA_in_protein, fixed_length, species = 'human', fasta_type = 'refseq') # get background data frame # background_df = get_global_background_df(species = 'human', fasta_type = 'refseq')
# construct foreground and background # To facilitate testing the module, select an appropriate number of items at random. foreground <- as.vector(foreground_df$aligned_seq) foreground <- foreground[sample(length(foreground), 1000)] background <- as.vector(background_df$Aligned_Seq) background <- background[sample(length(background), 10000)] motifs_list <- mea_based_on_background(foreground, AA_in_protein, background, motifx_pvalue) # Find sequences in foreground that are mapped to specific motif foreground_sequences_mapped_to_motifs <- get_foreground_seq_to_motifs(motifs_list, foreground) # Find data frame in foreground that are mapped to specific motif foreground_df_mapped_to_motifs <- get_foreground_df_to_motifs(foreground_sequences_mapped_to_motifs, foreground, foreground_df)
# The data can be used for ploting logo of sepcific motif: foreground_sequences_mapped_to_motifs # ploting logo: Q......S....... require(ggseqlogo) ggseqlogo(foreground_sequences_mapped_to_motifs[[1]]) if(TRUE){ # batch plot and count peptides for each motif foreground_sequences_mapped_to_motifs_count <- length(foreground_sequences_mapped_to_motifs) motifs <- names(foreground_sequences_mapped_to_motifs) peptides_count <- NULL for(i in seq_len(foreground_sequences_mapped_to_motifs_count)){ l_i <- foreground_sequences_mapped_to_motifs[[i]] peptides_count <- c(peptides_count, length(l_i)) } motifs_peptides_count_df <- data.frame(motifs, peptides_count) # quantile(peptides_count, seq(0,1,0.05)) if(FALSE){ plot_seqlogo(BASE_DIR, foreground_sequences_mapped_to_motifs, plot_min_seqs = 25) } }
# Select motifs at least having 50 peptides # Assign quantitative values of peptides to their motif foreground_value <- foreground_data[,-c(seq(1,6))] min_seqs <- 50 index_of_motifs <- which(peptides_count>=min_seqs) motif_group_m_ratio_df <- NULL for(i in index_of_motifs){ motif <- motifs[i] aligned_peptides <- foreground_sequences_mapped_to_motifs[[i]] index_of_match <- match(aligned_peptides, foreground_df$aligned_seq) motif_value <- foreground_value[index_of_match,] motif_value_colsum <- colSums(motif_value) motif_group_m <- tapply(motif_value_colsum, group, mean) motif_group_m_ratio <- motif_group_m/motif_group_m[1] motif_group_m_ratio_df <- rbind(motif_group_m_ratio_df, motif_group_m_ratio) } motifs_subset <- motifs[index_of_motifs] peptides_count_subset <- peptides_count[index_of_motifs] rownames(motif_group_m_ratio_df) <- paste(motifs_subset, peptides_count_subset) # The matrix is import inot pheatmap motif_group_m_ratio_df_mat <- as.matrix(motif_group_m_ratio_df) # plot pheatmap if(TRUE){ # library(pheatmap) # breaks and colors setting breaks_1 <- seq(0, 0.5, 0.1) colors_1 <- colorRampPalette(c('green', 'blue'))(length(breaks_1)-1) breaks_3 <- seq(0.5, 1.5, 0.1) colors_3 <- colorRampPalette(c('blue', 'white', '#FFBFBF'))(length(breaks_3)) breaks_4 <- seq(1.5, 2, 0.1) colors_4 <- colorRampPalette(c('#FFBFBF', 'red'))(length(breaks_4)) breaks_5 <- seq(2, 4, 0.1) colors_5 <- colorRampPalette(c('red','firebrick'))(length(breaks_5)) breaks <- c(breaks_1, breaks_3, breaks_4, breaks_5) breaks <- breaks[which(!duplicated(breaks))] colors <- c(colors_1, colors_3, colors_4, colors_5) colors <- colors[which(!duplicated(colors))] length(breaks) length(which(!duplicated(colors))) ph <- pheatmap( motif_group_m_ratio_df_mat, scale = 'none', # annotation_col = annotation_col, clustering_distance_cols = 'euclidean', fontsize_row = 6, cutree_rows = 1, show_rownames = TRUE, cluster_rows = TRUE, fontsize_col = 6, cutree_cols = 1, show_colnames = TRUE, cluster_cols = FALSE, border_color = 'black', # color = colors, cellwidth = 12, cellheight = 12, breaks = breaks, color = colors, legend_breaks = c(0, 0.5, 1, 1.5, 2, 4), legend_labels = c(0, 0.5, 1, 1.5, 2, 4), main = 'Motif enrichment analysis' ) }
# formatting output formatted_output_df <- formatted_output_mef_results(foreground_sequences_mapped_to_motifs) # write file # write.table(formatted_output_df, 'formatted_output_df.txt', row.names = FALSE, col.names = FALSE, sep = '\t')
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.