R/DIAUmpiretoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Documented in DIAUmpiretoMSstatsFormat

## raw.frag : FragSummary
## raw.pep : PeptideSummary with selected_fragmnet
## raw.pro : ProteinSummary with selected_peptide

## useUniquePeptide : remove peptides that are assigned for more than one proteins. We assume to use unique peptide for each protein.
## summaryforMultipleRows : max or sum - when there are multiple measurements for certain feature and certain fun, use highest or sum of all.
## fewMeasurements : if 1 or 2 measurements across runs per feature, 'remove' will remove those featuares. It can affected for unequal variance analysis.

#' @export
#' @importFrom dplyr %>% left_join semi_join group_by summarise
#' @importFrom tidyr separate_rows
#' @import stringr
DIAUmpiretoMSstatsFormat <- function(raw.frag, raw.pep, raw.pro, 
                                annotation,
                                useSelectedFrag = TRUE,
                                useSelectedPep = TRUE,
                                fewMeasurements="remove",
                                removeProtein_with1Feature = FALSE,
                                summaryforMultipleRows=max){
	
    if (is.null(fewMeasurements)) {
        stop('** Please select \'remove\' or \'keep\' for \'fewMeasurements\'.')
    }
    
    if (!is.element(fewMeasurements, c('remove', 'keep'))) {
        stop('** Please select \'remove\' or \'keep\' for \'fewMeasurements\'.')
    }
    
    if (is.null(annotation)) {
        stop('** Please prepare \'annotation\' as one of input.')
    } else {
        ## check annotation
        required.annotation <- c('Condition', 'BioReplicate', 'Run')
        
        if (!all(required.annotation %in% colnames(annotation))) {
            
            missedAnnotation <- which(!(required.annotation %in% colnames(annotation)))
            
            stop(paste("**", toString(required.annotation[missedAnnotation]), 
                       "is not provided in Annotation. Please check the annotation file."))
        } else {
            ### annotation.txt : Condition, BioReplicate, Run, 
            annot <- annotation
        }
    }
    
    ## Each Run should has unique information about condition and bioreplicate
    check.annot <- xtabs(~Run, annot)
    if ( any(check.annot > 1) ) {
        stop('** Please check annotation. Each MS run can\'t have multiple conditions or BioReplicates.' )
    }
    
    ########################
    ## 1.  get selected frag from DIA-Umpire
    ########################
    
    ## need to check whether 'Selected_fragments' column is available or not.
    if (!any(is.element(colnames(raw.pep), 'Selected_fragments'))) {
        stop('** Selected_fragments column is required. Please check it.')
    }
    
    ## Selected_fragments
    raw.pep2 <- raw.pep[, c('Peptide.Key', 'Proteins', 'Selected_fragments')]
    
    ## remove empty protein name
    raw.pep2 <- raw.pep2[raw.pep2$Proteins != '', ]
    #raw.pep2$Proteins <- gsub(';', '', raw.pep2$Proteins) ## be careful for multiple protein ids
    raw.pep2$Selected_fragments <- as.character(raw.pep2$Selected_fragments)
    
    ## get infor for selected fragment
    raw.pep3 <- raw.pep2 %>%
        separate_rows(Selected_fragments, sep="[|]")
    ## raw.pep3 has selected fragment information
    
    ########################
    ## 2. get selected peptides from DIA-Umpire
    ########################
    
    if (!any(is.element(colnames(raw.pro), 'Selected_peptides'))) {
        stop('** Selected_peptides column is required. Please check it.')
    }
    
    ## Selected_peptide
    raw.pro2 <- raw.pro[, c('Protein.Key', 'Selected_peptides')]
    
    ## remove empty protein name
    raw.pro2 <- raw.pro2[raw.pro2$Protein.Key != '', ]
    raw.pro2$Selected_peptides <- as.character(raw.pro2$Selected_peptides)
    
    ## get info for selected peptides
    raw.pro3 <- raw.pro2 %>%
        separate_rows(Selected_peptides, sep="[|]")

    ## top 6 peptides are selected, raw.pro3 has selected peptide information
    
    ########################
    ## 3. subtract the rows with selected peptides and fragments
    ########################
    colnames(raw.pro3) <- c('Protein', 'Peptide')
    colnames(raw.pep3) <- c('Peptide', 'Protein', 'Fragment')
    
    ## some proteins have no selected peptides
    ## ex : raw.pro3[raw.pro3$Peptide == '', ]
    ## !! if empty for selected peptide or fragments: 
    ## all zero or NA if there is no peptide information in protein
    ## remove all, none of peptide are selected
    ## conclusion, if 'selected_xxx' is empty, no selected.

    raw.pro3 <- raw.pro3[raw.pro3$Peptide != '', ]
    raw.pro3$Peptide <- as.character(raw.pro3$Peptide)
    
    raw.pep3 <- raw.pep3[raw.pep3$Fragment != '', ]
    raw.pep3$Peptide <- as.character(raw.pep3$Peptide)
    
    if (useSelectedFrag & useSelectedPep) {
        
        raw.pep3$Protein <- gsub(';', '', raw.pep3$Protein)
        
        ## remove the shared peptides
        count <- xtabs(~Peptide, raw.pro3)
        notuni.pep <- names(count)[count > 1]
        
        if (length(notuni.pep) > 0){
            ## remove not-unique peptide in proteien quantification
            raw.pro3 <- raw.pro3[-which(raw.pro3$Peptide %in% notuni.pep), ]
            raw.pep3 <- raw.pep3[-which(raw.pep3$Peptide %in% notuni.pep), ]
        } 
        
        ## get selected peptides only in raw.pro3
        raw.pep3 <- raw.pep3[which(raw.pep3$Peptide %in% unique(raw.pro3$Peptide)), ]
        unique(nchar(as.character(unique(raw.pep3$Protein))))
        
        ## The number of peptides are the same in raw.pep3 or raw.pro3.
        ## but the number of proteins are different.
        ## usd these selected peptides, and use protein id in raw.pro3. because pro3 assign one protein id for each peptides
        
        ## now raw.pep3 can be used for final list
        ## but need to assign protein id from raw.pro3. It should be updated in raw.frag.
        ## raw.frag can have multiple protein ids.
        raw.pep3 <- raw.pep3[, -which(colnames(raw.pep3) %in% 'Protein')]
        
        ## make the final list of selected proteins and peptides
        raw.final <- left_join(raw.pep3, raw.pro3, by=c('Peptide'))
        raw.final$Protein <- as.character(raw.final$Protein)
        
        message('** Get the selected fragments and peptides.')
        
        #length(unique(raw.final$Peptide)) # 12975
        #length(unique(raw.final$Protein)) # 3627
        #length(unique(raw.pep3$Peptide))
        #length(unique(raw.pro3$Protein))
        
    } else if (useSelectedFrag & !useSelectedPep) { # only use selected fragment, and use all peptides
        
        ## can use raw.pep3 only.
        ## but doublecheck protein id : protein id in raw.frag is always one id.
        ## need to find shared peptides
        count.pro <- str_count(raw.pep3$Protein, '\\;')

        ## remove not-unique peptide in proteien quantification
        raw.pep3 <- raw.pep3[count.pro < 2, ]
        
        raw.pep3$Protein <- gsub(';', '', raw.pep3$Protein)
        
        raw.final <- raw.pep3
        
        message('** Got the selected fragments.')
        
    } else if (!useSelectedFrag & !useSelectedPep) {
        
        stop('** MSstats recommends to use at least selected fragments.')
    }
    

    ############################
    ## 4. subtract fragment data with selected peptide and fragment
    ############################
    
    ## 0.1  check protein id
    raw.frag$Protein <- as.character(raw.frag$Protein)
    
    ## 0.2  reduce column
    inputlevel <- 'Intensity'
    
    ## columns which include 'inputlevel
    int.column <- colnames(raw.frag)[grep(inputlevel, colnames(raw.frag))]
    deft.column <- c('Fragment.Key', 'Protein','Peptide','Fragment')
    
    ## get subset of data first
    raw <- raw.frag[, which(colnames(raw.frag) %in% c(int.column, deft.column))]
    
    ## 0.3 update fragment info
    raw$Fragment <- as.character(raw$Fragment)
    
    if (length(grep('\\+', raw$Fragment[1])) < 1) { ## if Fragment column does not have charge
        
        raw$Fragment.Key <- as.character(raw$Fragment.Key)
        raw$ion <- substr(raw$Fragment.Key, start=nchar(raw$Fragment.Key), stop = nchar(raw$Fragment.Key))
        raw$Fragment <- paste(raw$Fragment, raw$ion, sep="_")
        
        raw <- raw[, -which(colnames(raw) %in% c('ion'))]
        
    } else {
        raw$Fragment <- gsub('\\+', '_', raw$Fragment)
    }
    
    ## remove extra column
    raw <- raw[, -which(colnames(raw) %in% c('Fragment.Key'))]
    
    ## 0.4 get selected fragment and peptide only
    raw$Peptide <- as.character(raw$Peptide)
    raw$Fragment <- as.character(raw$Fragment)
    raw$Fragment <- as.character(raw$Fragment)
    
    raw.selected <- semi_join(raw, raw.final, by=c('Protein', 'Peptide', 'Fragment'))
    
    message('** Extract the data from selected fragments and/or peptides.')
    
    ############################
    ## 5. reformat
    ############################
    ## change column names
    colnames(raw.selected)[colnames(raw.selected) == 'Protein'] <- 'ProteinName'
    colnames(raw.selected)[colnames(raw.selected) == 'Peptide'] <- 'PeptideSequence'
    colnames(raw.selected)[colnames(raw.selected) == 'Fragment'] <- 'FragmentIon'
    
    ## make long format
    raw.l <- melt(raw.selected, id.vars=c('ProteinName', 'PeptideSequence', 'FragmentIon'),
                  variable.name='Run', value.name='Intensity')
    
    ## remove suffix in the Run
    raw.l$Run <- gsub(paste("_", inputlevel, sep=""), '', raw.l$Run)
    
    ## fill in other column
    raw.l$PrecursorCharge <- NA
    raw.l$ProductCharge <- NA
    raw.l$IsotopeLabelType <- 'light'
    
    ## 4. add annotation
    input <- merge(raw.l, annot, by='Run')

    rm(raw.l)
    rm(raw.selected)
    
    ##############################
    ## 6. remove featuares with all na or zero
    ## some rows have all zero values across all MS runs. They should be removed.
    ##############################
    
    input$fea <- paste(input$PeptideSequence,
                       input$PrecursorCharge,
                       input$FragmentIon,
                       input$ProductCharge,
                       sep="_")
    
    inputtmp <- input[!is.na(input$Intensity) & input$Intensity > 1, ]
    
    count <- inputtmp %>% group_by(fea) %>% summarise(length=length(Intensity))
    
    ## get feature with all NA or zeros
    getfea <- count[count$length > 0, 'fea']
    
    if (nrow(getfea) > 0) {
        
        nfea.remove <- length(unique(input$fea)) - nrow(getfea)
        input <- input[which(input$fea %in% getfea$fea), ]
        
        message(paste0('** ', nfea.remove, ' features have all NAs or zero intensity values and are removed.'))
        
    }
    
    rm(inputtmp)
    
    ##############################
    ## 7. remove features which has 1 or 2 measurements across runs
    ##############################
    if (fewMeasurements=="remove") {
        
        ## it is the same across experiments. # measurement per feature. 
        xtmp <- input[!is.na(input$Intensity) & input$Intensity > 0, ]
        count_measure <- xtabs(~ fea, xtmp)
        
        remove_feature_name <- count_measure[count_measure < 3]
        
        if (length(remove_feature_name) > 0) {
            
            input <- input[-which(input$fea %in% names(remove_feature_name)), ]
            
            message(paste0('** ', length(remove_feature_name), 
                           ' features have 1 or 2 intensities across runs and are removed.'))
            
        }
    }
    
    ##############################
    ## 8. remove proteins with only one peptide and charge per protein
    ##############################
    
    if (removeProtein_with1Feature) {
        
        ## remove protein which has only one peptide
        tmp <- unique(input[, c("ProteinName", 'fea')])
        tmp$ProteinName <- factor(tmp$ProteinName)
        count <- xtabs( ~ ProteinName, data=tmp)
        lengthtotalprotein <- length(count)
        
        removepro <- names(count[count <= 1])
        
        if (length(removepro) > 0) {
            
            input <- input[-which(input$ProteinName %in% removepro), ]
            
            message(paste0("** ", length(removepro), 
                           ' proteins, which have only one feature in a protein, are removed among ', 
                           lengthtotalprotein, ' proteins.'))
        } else {
            
            message("All proteins have at least two features.")
            
        }
    }
    
    ##############################
    ## 9. remove multiple measurements per feature and run
    ##############################
    
    count <- aggregate(Intensity ~ fea, data=input, FUN=length)
    
    ## if any feature has more number of total MS runs, 
    if (any(unique(count$Intensity) > length(unique(input$Run)))){
        
        annotation <- unique(input[, c("Condition", "BioReplicate", "Run")])
        
        ## maximum or sum up abundances among intensities for identical features within one run
        input_w <- dcast( ProteinName + PeptideSequence + PrecursorCharge + FragmentIon + ProductCharge ~ Run, data=input, 
                          value.var='Intensity', 
                          fun.aggregate=summaryforMultipleRows, 
                          fill=NA_real_) 
        
        ## reformat for long format
        input <- melt(input_w, id=c('ProteinName', 'PeptideSequence', 'PrecursorCharge', 'FragmentIon', 'ProductCharge'))
        colnames(input)[which(colnames(input) %in% c('variable','value'))] <- c("Run","Intensity")
        
        ### add annotation
        input <- merge(input, annotation, by="Run")
        
        ## reorder columns
        input <- input[, c(2,3,4,5,6,8,1,9,7)]
        
        message('** Multiple measurements in a feature and a run are summarized by summaryforMultipleRows.')
        
    } else {
        ## remove column, named as 'fea'
        input <- input[, -which(colnames(input) %in% c('fea'))]
        message('** No multiple measurements in a feature and a run.')
    }
    
    input$IsotopeLabelType <- 'L'
    input$ProteinName <- factor(input$ProteinName)

	return(input)
}

Any scripts or data that you put into this service are public.

MSstats documentation built on Feb. 28, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/DIAUmpiretoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions DIAUmpiretoMSstatsFormat

Documented in DIAUmpiretoMSstatsFormat

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/DIAUmpiretoMSstatsFormat.R In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

Defines functions DIAUmpiretoMSstatsFormat

Documented in DIAUmpiretoMSstatsFormat

Try the MSstats package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

MSstats
Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments

R/DIAUmpiretoMSstatsFormat.R
In MSstats: Protein Significance Analysis in DDA, SRM and DIA for Label-free or Label-based Proteomics Experiments