R/normalizePhospho.R

Defines functions normalizePhospho

Documented in normalizePhospho

####################################################################
## Pairwise Normalization of MS-based phosphoproteomic data       ##
## Sohrab Saraei                                                  ##
## This function compensates for the bias introduced in global    ##
## phosphorylation in the sample after using median normalization.##
####################################################################

normalizePhospho <- function(enriched, non.enriched, phospho = NULL, samplesCols, modseqCols, techRep, plot.fc=NULL)
{
  #Check if all the necessary arguments are present
  if(missing(enriched)) stop("The function parameter (enriched) is missing!")
  if(missing(non.enriched)) stop("The function parameter (non.enriched) is missing!")
  if(missing(samplesCols)) stop("The function parameter (samplesCols) is missing!")
  if(missing(modseqCols)) stop("The function parameter (modseqCols) is missing!")
  if(missing(techRep)) stop("The function parameter (techRep) is missing!")
  
  #A function that checks the types of the columns of a data.frame
  
  #Check the type of the arguments
  class.dfCols <- function(df, types)
  {
    all(apply(df, MARGIN = 2, function(x) class(x)[1]) %in% types)
  }
  if(!is(enriched, "data.frame") & !is(enriched, "MSnSet"))
    stop("The enriched parameter must be of either types of data.frame or MSnSet")
  
  if(!is(non.enriched, "data.frame") & !is(non.enriched, "MSnSet"))
    stop("The non.enriched parameter must be of either types of data.frame or MSnSet")
  
  if(is(enriched, "MSnSet")) 
  {
    enriched.eset <- enriched
    enriched <- cbind(MSnbase::fData(enriched)[, modseqCols$enriched], MSnbase::exprs(enriched)[, samplesCols$enriched])
    non.enriched <- cbind(MSnbase::fData(non.enriched)[, modseqCols$non.enriched], MSnbase::exprs(non.enriched)[, samplesCols$non.enriched])
    modseqCols <- data.frame(enriched = c(1,2), non.enriched = c(1,2))
    samplesCols <- data.frame(enriched = 3:ncol(enriched), non.enriched = 3:ncol(non.enriched))
  }
  
  if(!is.null(phospho) & !methods::is(phospho, "character"))
    stop("The phospho parameter must be of type of character")
  
  if(!methods::is(samplesCols, "data.frame") | 
     ncol(samplesCols) != 2 | !class.dfCols(samplesCols, c("integer","numeric")) | 
     all(!colnames(samplesCols) %in% c("enriched", "non.enriched")))
    stop("The samplesCols must be data.frame with two columns, with the column names enriched and non.enriched
         , of type numeric or integer, which must contain the column number of samples that hold the abundances")
  
  if(!methods::is(modseqCols, "data.frame") | 
     ncol(modseqCols) != 2 | !class.dfCols(modseqCols, c("integer","numeric")) | 
     all(!colnames(modseqCols) %in% c("enriched", "non.enriched")))
    stop("The modseqCols must be data.frame with two columns, with the names enriched and non.enriched
         , of type numeric or integer, which must contain the column number of samples that hold the sequence 
         and modifications of the peptides")
  
  if(any(!class.dfCols(enriched[, modseqCols$enriched], "character") ,
         !class.dfCols(non.enriched[, modseqCols$non.enriched], "character")))
    stop("The sequence and modification columns that is specified are not of type charachter!")
  
  if(any(!class.dfCols(enriched[, samplesCols$enriched], c("numeric", "integer")) ,
         !class.dfCols(non.enriched[, samplesCols$non.enriched], c("numeric", "integer"))))
    stop("The samples specified are not of type of numeric")
  
  mod = seq = NULL
  enriched.original.mat <- as.matrix(enriched[, samplesCols$enriched])
  seqMod <- enriched[, modseqCols$enriched]
  #Removing peptides with non-quantified values across the runs
  enriched <- enriched[apply(X = enriched[ ,samplesCols$enriched], MARGIN = 1, 
                             function(x) all(x != 0)),]
  non.enriched <- non.enriched[apply(X = non.enriched[,samplesCols$non.enriched], MARGIN = 1, 
                                     function(x) all(x != 0)),]
  
  colnames(enriched)[modseqCols$enriched] <- c("seq", "mod")
  colnames(non.enriched)[modseqCols$non.enriched] <- c("seq", "mod")
  
  if(is.null(phospho))
  {
    enriched <- enriched[grepl(pattern = "Phospho", x = enriched$mod),]
    non.enriched <- non.enriched[grepl(pattern = "Phospho", x = non.enriched$mod),]
  } else {
    enriched <- enriched[grepl(pattern = phospho, x = enriched$mod),]
    non.enriched <- non.enriched[grepl(pattern = phospho, x = non.enriched$mod),]
  }
  
  enriched <- plyr::ddply(enriched, plyr::.(seq, mod),
                          function(df) colSums(df[, samplesCols$enriched]))
  non.enriched <- plyr::ddply(non.enriched, plyr::.(seq, mod),
                              function(df) colSums(df[, samplesCols$non.enriched]))
  
  enriched[,"modSeq"] <- paste(enriched$seq, enriched$mod,sep = ", ")
  non.enriched[,"modSeq"] <- paste(non.enriched$seq, non.enriched$mod,sep = ", ")

  inter <- intersect(non.enriched$modSeq, enriched$modSeq)

  stopifnot(length(inter) > 0)
  enriched.olp.idx <- which(enriched$modSeq %in% inter)
  non.enriched.olp.idx <- which(non.enriched$modSeq %in% inter)
  
  enriched.mat <- enriched[enriched.olp.idx, ]
  non.enriched.mat <- non.enriched[non.enriched.olp.idx, ]
  
  enriched.mat <- enriched.mat[order(enriched.mat$modSeq),]
  non.enriched.mat <- non.enriched.mat[order(non.enriched.mat$modSeq),]
  
  enriched.mat <- as.matrix(enriched.mat[, -c(1,2,ncol(enriched))])
  non.enriched.mat <- as.matrix(non.enriched.mat[, -c(1,2,ncol(enriched))])
  if(length(inter) > 1) {
    ratios <- non.enriched.mat/enriched.mat
    
    colnames(ratios) <- as.numeric(techRep)
    ratios.avg <- matrix(nrow = nrow(ratios), ncol = length(levels(techRep)))
    for (tr in levels(techRep)) {
      if(nrow(ratios.avg) == 1) {
        ratios.avg[,as.numeric(tr)] <- mean(ratios[,colnames(ratios) == levels(techRep)[as.numeric(tr)]])
      } else {
        ratios.avg[,as.numeric(tr)] <- rowMeans(ratios[,colnames(ratios) == levels(techRep)[as.numeric(tr)]])
      }
    }
    
    
    max.fc <- log2(ratios.avg[,1]) - log2(ratios.avg[,2])
    
    for (i in 2:(ncol(ratios.avg)-1)) {
      for (j in (i+1):(ncol(ratios.avg))) {
        max.fc <- matrixStats::rowMaxs(cbind(max.fc, log2(ratios.avg[,i]) - log2(ratios.avg[,j])))
      }
    }
    
    boxp <- boxplot(max.fc, plot = FALSE)
    ratios <- ratios[!(max.fc > max(boxp$stats)),]
    
    ratios <- log10(ratios)
    if(methods::is(ratios, "matrix") | methods::is(ratios, "data.frame")) {
      col.sub <- rowMeans(ratios)
    } else {
      col.sub <- mean(ratios)
    }
    
    ratios.norm <- ratios - col.sub
    
    if(methods::is(ratios, "matrix") | methods::is(ratios, "data.frame")) {
      factors <- 10^(matrixStats::colMedians(ratios.norm))
    } else {
      factors <- ratios.norm
    }
  } else {
    factors <- as.numeric(non.enriched.mat/enriched.mat)
  }

  enriched.normalized.mat <- t(t(enriched.original.mat) * factors)
  	if(!is.null(plot.fc)) {
	for(i in plot.fc$control) {
	  for(j in plot.fc$samples) {
		a.original <- rowMeans(log2(enriched.original.mat[,which(techRep==i)]+1),na.rm=TRUE)
		b.original <- rowMeans(log2(enriched.original.mat[,which(techRep==j)]+1),na.rm=TRUE)
		fc.original <- a.original - b.original
		a.normnalized <- rowMeans(log2(enriched.normalized.mat[,which(techRep==i)]+1),na.rm=TRUE)
		b.normnalized <- rowMeans(log2(enriched.normalized.mat[,which(techRep==j)]+1),na.rm=TRUE)
		fc.normnalized <- a.normnalized - b.normnalized
		boxplot(cbind(fc.original, fc.normnalized), range=1.5, outline=FALSE, main=paste0("Peptide log fold changes", " (sample ", j, " vs sample ", i, ")"), names=c("Median normalized","Pairwise normalized"))
		abline(h=0, lty=2)}
	}

	}
  message(paste0("The number of peptides in the intersect is: ", length(inter)))
  message(paste0(length(plot.fc$control) * length(plot.fc$samples), " plots generated. Browse through them."))
  data.frame(seqMod, t(t(enriched.original.mat) * factors)) 
}

Try the phosphonormalizer package in your browser

Any scripts or data that you put into this service are public.

phosphonormalizer documentation built on Dec. 10, 2020, 2:01 a.m.