## ----load---------------------------------------------------------------------
# load PhosMap
library('PhosMap')
# load intermediate results from https://github.com/ecnuzdd/PhosMap_datasets
# manully download from https://github.com/ecnuzdd/PhosMap_datasets/blob/master/data/BRAFi.RData
load("BRAFi.RData")
# background_df A data frame for motif enrichment analysis as background
# combined_df_with_mapped_gene_symbol Get a data frame mapped GI number to Gene Symbol
# data_frame_normalization_with_control_no_pair A data frame containning phosphoproteomics data normalized by proteomics data.
# foreground_df A data frame for motif enrichment analysis as foreground.
# fuzzy_input_df A data frame for time course analysis as input.
# group A factor for experiment group information.
# merge_df_with_phospho_peptides A merged phosphoproteomics data frame based on peptides files (unique ID).
# motif_group_m_ratio_df_mat A matrix for motif profile.
# phospho_data_filtering_STY_and_normalization A phosphoproteomics data frame after normalization and filtering.
# profiling_data_normalized A proteomics data frame after normalization and filtering.
# summary_df_of_unique_proteins_with_sites A data frame that phosphorylation sites had been mapping to protein sequence and eliminated redundancy.
## ----extract_psites_score, eval = FALSE---------------------------------------
# BASE_DIR <- getwd() # working directory
# BASE_DIR <- normalizePath(BASE_DIR)
# phosphorylation_exp_design_info_file_path <- normalizePath(file.path(BASE_DIR, 'phosphorylation_exp_design_info.txt'))
# phosphorylation_peptide_dir <- normalizePath(file.path(BASE_DIR, 'phosphorylation_peptide_txt'))
#
# if(FALSE){
# # if you have xml files from mascot results, you can run the cmd to parser them to text files.
# mascot_xml_dir <- normalizePath(file.path(BASE_DIR, 'mascot_xml'))
# mascot_txt_dir <- normalizePath(file.path(BASE_DIR, 'mascot_txt'))
# extract_psites_score(phosphorylation_exp_design_info_file_path, mascot_xml_dir, mascot_txt_dir)
#
# # Based on above-mentioned text files from Mascot results,
# # the following cmd can generate CSV files of phosphorylation sites with confidence score.
# psites_score_dir <- normalizePath(file.path(BASE_DIR, 'psites_score_txt'))
# generate_psites_score_file(mascot_txt_dir, phosphorylation_peptide_dir, psites_score_dir)
# }
# # Merge phosphoproteomics data based on peptides files (unique ID).
# # If qc = TRUE, considering confidence score of phosphorylation sites.
# # A merged phosphoproteomics data frame based on peptides files (unique ID).
# merge_df_with_phospho_peptides <- pre_process_filter_psites(
# phosphorylation_peptide_dir,
# psites_score_dir,
# phosphorylation_exp_design_info_file_path,
# qc = TRUE,
# min_score = 20,
# min_FDR = 0.01
# )
## -----------------------------------------------------------------------------
# Get a data frame mapped GI number to Gene Symbol.
# system.time({
# combinated_df_with_mapped_gene_symbol = get_combined_data_frame(
# merge_df_with_phospho_peptides, species = 'human', id_type = 'RefSeq_Protein_GI'
# )
# })
head(combined_df_with_mapped_gene_symbol)
## -----------------------------------------------------------------------------
# Assign psites to protein sequence.
# Unique ID: protein_gi + phosphorylation site in protein sequence.
# system.time({
# summary_df_of_unique_proteins_with_sites = get_summary_with_unique_sites(
# combinated_df_with_mapped_gene_symbol, species = 'human', fasta_type = 'refseq'
# )
# })
head(summary_df_of_unique_proteins_with_sites)
## ----eval = FALSE-------------------------------------------------------------
# # Imputation with the next order of magnitude of the minimum except for zero.
# # Filtering data only including phosphorylation site.
# phospho_data_filtering_STY_and_normalization_list <- get_normalized_data_of_psites(
# summary_df_of_unique_proteins_with_sites,
# phosphorylation_exp_design_info_file_path,
# topN = NA, mod_types = c('S', 'T', 'Y')
# )
# phospho_data_filtering_STY <- phospho_data_filtering_STY_and_normalization_list$ptypes_area_df_with_id
# phospho_data_filtering_STY_and_normalization <- phospho_data_filtering_STY_and_normalization_list$ptypes_fot5_df_with_id
# head(phospho_data_filtering_STY_and_normalization)
## ----eval = FALSE-------------------------------------------------------------
# # Based on phospho_data_filtering_STY_and_normalization
# ID <- paste(phospho_data_filtering_STY_and_normalization$GeneSymbol,
# phospho_data_filtering_STY_and_normalization$AA_in_protein,
# sep = '_')
# Value <- phospho_data_filtering_STY_and_normalization[,-seq(1,6)]
# phospho_data <- data.frame(ID, Value)
# phospho_data_rownames <- paste(phospho_data_filtering_STY_and_normalization$GI,
# phospho_data_filtering_STY_and_normalization$GeneSymbol,
# phospho_data_filtering_STY_and_normalization$AA_in_protein,
# sep = '_')
# rownames(phospho_data) <- phospho_data_rownames
# # Further normalize phosphoproteomics data based on proteomics data
# # The configurations of function see help document.
# data_frame_normalization_with_control_no_pair <- normalize_phos_data_to_profiling(
# phospho_data, profiling_data_normalized,
# phosphorylation_exp_design_info_file_path,
# profiling_exp_design_info_file_path,
# control_label = '0',
# pair_flag = FALSE
# )
# head(data_frame_normalization_with_control_no_pair)
## ----visualization_with_simple_tsne, fig.cap = "clustering example: t-SNE", fig.wide = TRUE----
# Clustering: t-SNE or PCA
expr_data_frame <- data_frame_normalization_with_control_no_pair
# t-SNE using all experiments
visualization_with_simple_tsne(expr_data_frame, group)
## ----visualization_with_simple_pca, fig.cap = "clustering example: PCA", fig.wide = TRUE----
expr_ID <- as.vector(expr_data_frame[,1])
expr_Valule <- expr_data_frame[,-1]
expr_Valule_mean <- NULL
expr_Valule_row <- nrow(expr_Valule)
for(i in 1:expr_Valule_row){
x <- as.vector(unlist(expr_Valule[i,]))
x_m <- tapply(x, group, mean)
expr_Valule_mean <- rbind(expr_Valule_mean, x_m)
}
group_levels = levels(group)
colnames(expr_Valule_mean) <- group_levels
expr_df <- data.frame(expr_ID, expr_Valule_mean)
# PCA using mean value in group for comparison with original literature
visualization_with_simple_pca(expr_df)
## -----------------------------------------------------------------------------
# Differently expressed Proteins/Genes analysis
# t2 vs t0
expr_data_frame <- data_frame_normalization_with_control_no_pair[,1:17] # phosphoproteomics data normalized by proteomics data
# select phosphorylation sites with greater variation
expr_data_frame_var <- apply(expr_data_frame, 1, function(x){
var(x[-1])
})
index_of_kept <- which(expr_data_frame_var>1)
expr_data_frame <- expr_data_frame[index_of_kept,]
# group information (t0 vs t2)
deps_group_levels <- c('t0', 't2')
deps_group <- factor(as.vector(group)[1:16], levels = deps_group_levels)
## ----analysis_deps_limma, fig.cap = "Differential expression analysis: limma", fig.wide = TRUE----
# (1) limma
limma_results_df <- analysis_deps_limma(expr_data_frame, deps_group, deps_group_levels, log2_label = FALSE, adjust_method = 'none')
limma_results_df$ID <- apply(limma_results_df, 1, function(x){
x = strsplit(x, '_')[[1]]
paste(x[2], x[3], sep = '_')
})
visualization_deps_with_scatter(limma_results_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with limma',
show_text = FALSE, min_up_text = 70, min_down_text = 70)
## ----eval=FALSE---------------------------------------------------------------
# # (2) SAM
# sam_results_list <- analysis_deps_sam(expr_data_frame, deps_group, log2_label = FALSE, nperms = 100, rand = NULL, minFDR = 0.05, samr_plot = TRUE)
# sam_results <- rbind(sam_results_list$genes_up_df, sam_results_list$genes_down_df)
## ----eval=FALSE---------------------------------------------------------------
# # (3) annova
# anova_result_df <- analysis_deps_anova(expr_data_frame, deps_group, log2_label = FALSE, return_padjust = TRUE, adjust_method = 'BH')
# visualization_deps_with_scatter(anova_result_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with anova',
# show_text = FALSE, min_up_text = 15, min_down_text = 15)
## ----visualization_fuzzycluster, fig.cap = "Time course analysis example", fig.width = 6, fig.height = 6, fig.align = 'center'----
group_levels <- levels(group)
# fuzzy c-means clustering
set.seed(1000)
fuzzy_clustObj <- visualization_fuzzycluster(
fuzzy_input_df, group, group_levels,
k_cluster=9, iteration = 100,
mfrow = c(3,3), min_mem = 0.1,
plot = TRUE
)
# clusters information
clusterS_info <- fuzzy_clustObj$cluster
clusterS_names <- names(clusterS_info)
clusters_df <- data.frame(clusterS_names, clusterS_info)
# write.csv(clusters_df, 'clusters_df.csv', row.names = TRUE)
## -----------------------------------------------------------------------------
# For early and late response
# early -> clusterS_info==1
# late -> clusterS_info==2
cluster_flag <- 'early'
cluster_symbol <- clusterS_names[clusterS_info==1]
expr_data_frame <- data_frame_normalization_with_control_no_pair
index_of_cluster <- match(cluster_symbol, expr_data_frame$ID)
cluster_df <- expr_data_frame[index_of_cluster,]
## ----fig.cap = "KSEA method", fig.width = 6, fig.height = 5, fig.align = 'center'----
# Perform KSEA
summary_df_list_from_ksea_cluster <- get_summary_from_ksea(cluster_df, species = 'human', log2_label = FALSE, ratio_cutoff = 3)
# Activity of regulons for regulation
ksea_regulons_activity_df_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
ksea_id_cluster <- as.vector(ksea_regulons_activity_df_cluster[,1])
ksea_value_cluster <- ksea_regulons_activity_df_cluster[,-1]
if(FALSE){
# Pvalue of regulons for regulation
ksea_regulons_pvalue_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_pvalue_df
# Activity of regulons for regulation
ksea_regulons_activity_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
# Expression ratio of regulons for regulation
ksea_kinase_site_substrate_original_ratio_cluster <- summary_df_list_from_ksea_cluster$ksea_kinase_site_substrate_original_ratio_df
}
# plot pheatmap
if(TRUE){
# annotation setting
annotation_col <- data.frame(
group = group
)
rownames(annotation_col) <- colnames(ksea_value_cluster)
# breaks and colors setting
breaks_1 <- seq(-4, -2, 0.2)
colors_1 <- colorRampPalette(c('#11264f', '#145b7d'))(length(breaks_1)-1)
breaks_2 <- seq(-2, -1, 0.2)
colors_2 <- colorRampPalette(c('#145b7d', '#009ad6'))(length(breaks_2))
breaks_3 <- seq(-1, 1, 0.2)
colors_3 <- colorRampPalette(c('#009ad6', 'white', '#FF6600'))(length(breaks_3))
breaks_4 <- seq(1, 2, 0.2)
colors_4 <- colorRampPalette(c('#FF6600', 'red'))(length(breaks_4))
breaks_5 <- seq(2, 4, 0.2)
colors_5 <- colorRampPalette(c('red', 'firebrick'))(length(breaks_5))
breaks <- c(breaks_1, breaks_2, breaks_3, breaks_4, breaks_5)
breaks <- breaks[which(!duplicated(breaks))]
color <- c(colors_1, colors_2, colors_3, colors_4, colors_5)
color <- color[which(!duplicated(color))]
library(pheatmap)
ph <- pheatmap(
ksea_value_cluster,
scale = 'none',
annotation_col = annotation_col,
clustering_distance_rows = 'euclidean',
fontsize_row = 5,
# cutree_rows = 1,
show_rownames = TRUE,
fontsize_col = 5,
# cutree_cols = 1,
cluster_cols = FALSE,
border_color = 'black',
cellwidth = 5, cellheight = 5,
breaks = breaks,
color = color,
legend_breaks = c(-4, -2, -1, 0, 1, 2, 4),
legend_labels = c(-4, -2, -1, 0, 1, 2, 4),
main = paste('Kinase-substrate enrichment analysis', cluster_flag, sep = ' ')
)
}
## ----eval = FALSE-------------------------------------------------------------
# # get kinase activity matrix with multiple linear regression (mlr) method
# kinase_activity_df_mlr <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mlr')
## ----eval = FALSE-------------------------------------------------------------
# # get kinase activity matrix with mean value method
# kinase_activity_df_mean <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mean')
## ----get_aligned_seq_for_mea--------------------------------------------------
# *** foreground ***
foreground_data <- phospho_data_filtering_STY_and_normalization # pre-processed data
foreground_sequence <- as.vector(foreground_data$Sequence)
GI <- as.vector(foreground_data$GI)
Sequence <- as.vector(foreground_data$Sequence)
AA_in_protein <- as.vector(foreground_data$AA_in_protein)
# *** required parameters ***
fixed_length <- 15
species <- 'human'
motifx_pvalue <- 0.01
# get foreground data frame
# foreground_df = get_aligned_seq_for_mea(ID, Sequence, AA_in_protein, fixed_length, species = 'human', fasta_type = 'refseq')
# get background data frame
# background_df = get_global_background_df(species = 'human', fasta_type = 'refseq')
## ----mea_based_on_background--------------------------------------------------
# construct foreground and background
# To facilitate testing the module, select an appropriate number of items at random.
foreground <- as.vector(foreground_df$aligned_seq)
foreground <- foreground[sample(length(foreground), 1000)]
background <- as.vector(background_df$Aligned_Seq)
background <- background[sample(length(background), 10000)]
motifs_list <- mea_based_on_background(foreground, AA_in_protein, background, motifx_pvalue)
# Find sequences in foreground that are mapped to specific motif
foreground_sequences_mapped_to_motifs <- get_foreground_seq_to_motifs(motifs_list, foreground)
# Find data frame in foreground that are mapped to specific motif
foreground_df_mapped_to_motifs <- get_foreground_df_to_motifs(foreground_sequences_mapped_to_motifs, foreground, foreground_df)
## ----ggseqlogo, fig.cap = "Plot motif logo", fig.width = 6, fig.height = 6, fig.align = 'center'----
# The data can be used for ploting logo of sepcific motif: foreground_sequences_mapped_to_motifs
# ploting logo: Q......S.......
require(ggseqlogo)
ggseqlogo(foreground_sequences_mapped_to_motifs[[1]])
if(TRUE){
# batch plot and count peptides for each motif
foreground_sequences_mapped_to_motifs_count <- length(foreground_sequences_mapped_to_motifs)
motifs <- names(foreground_sequences_mapped_to_motifs)
peptides_count <- NULL
for(i in seq_len(foreground_sequences_mapped_to_motifs_count)){
l_i <- foreground_sequences_mapped_to_motifs[[i]]
peptides_count <- c(peptides_count, length(l_i))
}
motifs_peptides_count_df <- data.frame(motifs, peptides_count)
# quantile(peptides_count, seq(0,1,0.05))
if(FALSE){
plot_seqlogo(BASE_DIR, foreground_sequences_mapped_to_motifs, plot_min_seqs = 25)
}
}
## ----Assign quantitative values of peptides to their motif, fig.cap = "KSEA method", fig.width = 6, fig.height = 5, fig.align = 'center'----
# Select motifs at least having 50 peptides
# Assign quantitative values of peptides to their motif
foreground_value <- foreground_data[,-c(seq(1,6))]
min_seqs <- 50
index_of_motifs <- which(peptides_count>=min_seqs)
motif_group_m_ratio_df <- NULL
for(i in index_of_motifs){
motif <- motifs[i]
aligned_peptides <- foreground_sequences_mapped_to_motifs[[i]]
index_of_match <- match(aligned_peptides, foreground_df$aligned_seq)
motif_value <- foreground_value[index_of_match,]
motif_value_colsum <- colSums(motif_value)
motif_group_m <- tapply(motif_value_colsum, group, mean)
motif_group_m_ratio <- motif_group_m/motif_group_m[1]
motif_group_m_ratio_df <- rbind(motif_group_m_ratio_df, motif_group_m_ratio)
}
motifs_subset <- motifs[index_of_motifs]
peptides_count_subset <- peptides_count[index_of_motifs]
rownames(motif_group_m_ratio_df) <- paste(motifs_subset, peptides_count_subset)
# The matrix is import inot pheatmap
motif_group_m_ratio_df_mat <- as.matrix(motif_group_m_ratio_df)
# plot pheatmap
if(TRUE){
# library(pheatmap)
# breaks and colors setting
breaks_1 <- seq(0, 0.5, 0.1)
colors_1 <- colorRampPalette(c('green', 'blue'))(length(breaks_1)-1)
breaks_3 <- seq(0.5, 1.5, 0.1)
colors_3 <- colorRampPalette(c('blue', 'white', '#FFBFBF'))(length(breaks_3))
breaks_4 <- seq(1.5, 2, 0.1)
colors_4 <- colorRampPalette(c('#FFBFBF', 'red'))(length(breaks_4))
breaks_5 <- seq(2, 4, 0.1)
colors_5 <- colorRampPalette(c('red','firebrick'))(length(breaks_5))
breaks <- c(breaks_1, breaks_3, breaks_4, breaks_5)
breaks <- breaks[which(!duplicated(breaks))]
colors <- c(colors_1, colors_3, colors_4, colors_5)
colors <- colors[which(!duplicated(colors))]
length(breaks)
length(which(!duplicated(colors)))
ph <- pheatmap(
motif_group_m_ratio_df_mat,
scale = 'none',
# annotation_col = annotation_col,
clustering_distance_cols = 'euclidean',
fontsize_row = 6, cutree_rows = 1, show_rownames = TRUE, cluster_rows = TRUE,
fontsize_col = 6, cutree_cols = 1, show_colnames = TRUE, cluster_cols = FALSE,
border_color = 'black',
# color = colors,
cellwidth = 12, cellheight = 12,
breaks = breaks,
color = colors,
legend_breaks = c(0, 0.5, 1, 1.5, 2, 4),
legend_labels = c(0, 0.5, 1, 1.5, 2, 4),
main = 'Motif enrichment analysis'
)
}
## ----formatted_output_foreground_sequences_mapped_to_motifs-------------------
# formatting output
formatted_output_df <- formatted_output_mef_results(foreground_sequences_mapped_to_motifs)
# write file
# write.table(formatted_output_df, 'formatted_output_df.txt', row.names = FALSE, col.names = FALSE, sep = '\t')
## ----session------------------------------------------------------------------
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.