knitr::opts_chunk$set(echo = TRUE, message = FALSE)
This Vignette describes how the prozor::greedy
function can be used to infer proteins form peptide identifications using the Occam's razor principle. The method tries to find a minimal set of porteins which can explain all the peptides identified.
library(prozor) rm(list = ls()) file = system.file("extdata/IDResults.txt.gz" , package = "prozor") specMeta <- readr::read_tsv(file) head(specMeta)
Annotate peptide sequences with protein sequences from two fasta files on with reviewed entries only (sp) and the other with reviewed and Trembl entries (sp/tr).
length(unique(specMeta$peptideSeq)) upeptide <- unique(specMeta$peptideSeq) resAll <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_allSeq.fasta.gz" , package = "prozor")) resCan <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor")) annotAll <- prozor::annotatePeptides(upeptide, resAll) head(subset(annotAll,select = -proteinSequence)) annotCan <- prozor::annotatePeptides(upeptide, resCan) barplot(c(All = length(resAll),Canonical = length(resCan)))
barplot(c(All = nrow(annotAll), Canonical = nrow(annotCan)), ylab = "# peptide protein matches.")
We can see that using the larger fasta database reduces the proportion of proteotypic peptides.
PCProteotypic_all <- sum(table(annotAll$peptideSeq) == 1) / length(table(annotAll$peptideSeq)) * 100 PCProteotypic_canonical <- sum(table(annotCan$peptideSeq) == 1) / length(table(annotCan$peptideSeq)) * 100 barplot( c(All = PCProteotypic_all, Canonical = PCProteotypic_canonical), las = 2, ylab = "% proteotypic" )
We can now identify a minmal set of proteins explaining all the peptides observed for both databases
library(Matrix) precursors <- unique(subset(specMeta, select = c( peptideModSeq, precursorCharge, peptideSeq )))
library(Matrix) annotatedPrecursors <- merge(precursors , subset(annotAll, select = c(peptideSeq, proteinID)), by.x = "peptideSeq", by.y = "peptideSeq") xx <- prepareMatrix(annotatedPrecursors, proteinID = "proteinID", peptideID = "peptideSeq") image(xx) xxAll <- greedy(xx)
annotatedPrecursors <- merge(precursors , subset(annotCan, select = c(peptideSeq, proteinID)), by.x = "peptideSeq", by.y = "peptideSeq") xx <- prepareMatrix(annotatedPrecursors , proteinID = "proteinID", peptideID = "peptideSeq") image(xx) xxCAN <- greedy(xx)
We see that the number of proteins needed to explain all the peptides is practically identical for both databases. Also in practice using a database with more entries does not lead to more identified proteins. On the contrary, it might even reduce the number of porteins identified.
barplot(c(All_before = length(unique(annotAll$proteinID)), All_after = length(unique(unlist( xxAll ))) , Canonical_before = length(unique(annotCan$proteinID)), Canonical_after = length(unique(unlist( xxCAN )))))
Protein Grouping and Clustering in Scaffold
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.