# find mean profiles for proteins, or for protein-peptide combinations
# requires "snowfall" package for multi-core processing
# service function for profileSummarize
# @param i protein in row i
# @param uniqueLabel either protId or pepId as specified by "GroupBy" in profileSummarize
# @param protsCombineCnew dataframe of protein profiles
# @param numRefCols number of columns preceding the data
# @param numDataCols number of numerical values in a profile
# @param GroupBy "protId" if average by protein; "peptideId" if average by peptide
# @param eps small value to add so that log argument is greater than zero
# @param outlierExclude none, spectra, or spectraAndPeptide
# @return result_i coefficient estimates, numbers of peptides and spectra, and id for protein i
#' @importFrom stats var
#' @importFrom lme4 lmer
#' @importFrom lme4 fixef
#' @importFrom stats vcov
#' @importFrom stats sd
#' @importFrom stats median
#
#' @export
meansByProteins <- function(i, uniqueLabel, protsCombineCnew, numRefCols, numDataCols,
GroupBy, eps, outlierExclude) {
# outlierExclude:
# none: don't exclude any outliers
# spectra: exclude only spectra within peptides
# spectraAndPeptide: exclude spectra-within-peptide outliers and peptide outliers
# i=217
# i=2
protData_i <- protsCombineCnew[uniqueLabel == i, ]
prot_i <- protData_i$prot[1]
protId_i <- protData_i$protId[1]
pep_i <- protData_i$peptide[1]
pepId_i <- protData_i$pepId[1]
profileAll_i_x <- cbind(protData_i[,numRefCols + c(seq_len(numDataCols))], protData_i$protId, protData_i$pepId)
names(profileAll_i_x)[ numDataCols + 1] <- "protId"
names(profileAll_i_x)[ numDataCols + 2] <- "pepId"
profileAll_i <- profileAll_i_x
profileAll_i[,seq_len(numDataCols)] <- log2(profileAll_i_x[,seq_len(numDataCols)] + eps) # transform to log2 scale
# identify and eliminate outliers
#use_i <- {protData_i$outCountBoxPlot == 0}
if (outlierExclude == "none") {
use_i <- rep(T, nrow(protData_i))
}
if (outlierExclude == "spectra") {
use_i <- {{protData_i$outlier.num.spectra == 0} & {!is.na(protData_i$outlier.num.spectra)}}
}
if (outlierExclude == "spectraAndPeptide") {
use_i <- {{protData_i$outlier.num.spectra == 0} & {!is.na(protData_i$outlier.num.spectra)} &
{protData_i$outlier.num.peptides == 0} & {!is.na(protData_i$outlier.num.peptides)} }
}
profileXX_i <- profileAll_i[use_i,]
Nspectra <- nrow(profileXX_i)
if (Nspectra == 0) {
coef_est <- rep(NA, numDataCols)
secoef_est <- rep(NA, numDataCols)
Npep <- 0
result_i <- c(coef_est, secoef_est, Nspectra, Npep, protId_i, pepId_i)
return(result_i)
}
Npep <- length(unique(profileXX_i$pepId))
# need at least 4 spectra and 3 unique peptides
lmerGood <- {{Nspectra >= 4} & {Npep >= 3}}
if (Nspectra == Npep) lmerGood <- FALSE # don't do if one seq per spectrum
if (GroupBy == "peptideId") lmerGood <- FALSE # makes no sense if by peptide
if (lmerGood) {
coef_est <- rep(NA, numDataCols)
secoef_est <- rep(NA, numDataCols)
for (k in seq_len(numDataCols)) { # do for each fraction
#k=1
y_k <- profileXX_i[,k] # k'th column (fraction)
varOK <- {var(y_k) > 0.001}
#if (varOK) {
result_k <- try(lmer(y_k ~ 1 + (1 | pepId), data=profileXX_i))
try(coef_est[k] <- fixef(result_k))
try(secoef_est[k] <- sqrt(as.numeric(vcov(result_k))))
# }
if (!varOK | is.na(coef_est[k])) { # variance too small for lmer;
coef_est[k] <- mean(y_k)
secoef_est[k] <- 0.001
}
if (is(result_k, "try-error")) {
coef_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,mean))
secoef_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,sd))/sqrt(Nspectra)
if (secoef_est < 0.01) se.coef_est <- 0.01
}
}
}
if ({!lmerGood} & {Nspectra == 1}) {
coef_est <- as.numeric(profileXX_i[1,seq_len(numDataCols)])
secoef_est <- rep(NA, numDataCols)
}
if ({!lmerGood} & {Nspectra == 2}) {
coef_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,mean))
secoef_est <- rep(NA, numDataCols)
}
if ({!lmerGood} & {Nspectra >= 3} ) {
coef_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,mean))
secoef_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,sd))/sqrt(Nspectra)
coef_median_est <- as.numeric(apply(profileXX_i[,seq_len(numDataCols)],2,median))
}
# apply(profile99.i,2,mean) # test; should be similar to coef_est
raw_coef_orig <- 2^coef_est - eps
facAdj <- sum(raw_coef_orig) # adjustment factor to make things add to 1
raw_coef <- raw_coef_orig/facAdj
coef_est <- log2(raw_coef + eps)
# if GroupBy = "protId", prot_i is the protein number, and i is the same
# if GroupBy = "pepId", prot_i is the protein number, and i is the peptide number
result_i <- c(coef_est, secoef_est, Nspectra, Npep, protId_i, pepId_i)
return(result_i)
}
# ================================================================
#' protProfileSummarize calculates mean and SE for each channel in
#' each prot using random effect model or arithmetic mean
#' random effect model can avoid dominance of a sequence with
#' too many spectra
#'
#' @param protsCombineCnew a matrix of protein information and normalized specific amounts and outlier information
#' @param numRefCols number of columns before Mass Spectrometry data columns
#' @param numDataCols how many columns in MS data
#' @param refColsKeep which reference (non-data) columns to keep
#' @param eps epsilon to avoid log(0)
#' @param GroupBy "protId" if average by protein; "peptideId" if average by peptide
#' @param outlierExclude "none", "spectra", or "spectraAndpeptide" (default) according to exclusion level
#' @param cpus number of cpus to use for parallel processing
#' @importFrom snowfall sfInit
#' @importFrom snowfall sfLibrary
#' @importFrom snowfall sfExport
#' @importFrom snowfall sfLapply
#' @importFrom snowfall sfStop
#' @importFrom lme4 lmer
#' @export
#' @return protProfileSummaryIdentity: mean or weighted mean normalized specific amount profiles
#' @examples
#' set.seed(17356)
#' eps <- 0.029885209
#' flagSpectraBox <- outlierFind(protClass=TLN1_test,
#' outlierLevel="peptide", numRefCols=5, numDataCols=9,
#' outlierMeth="boxplot", range=3, eps=eps,
#' cpus=1, randomError=TRUE)
#' # examine numbers of spectra that are outliers
#' table(flagSpectraBox$outlier.num.spectra)
#' pepProfiles <- profileSummarize(protsCombineCnew=flagSpectraBox,
#' numRefCols=6, numDataCols=9, refColsKeep=c(1,2,4),eps=eps,
#' GroupBy="peptideId", outlierExclude="spectra")
#' str(pepProfiles, strict.width="cut", width=65)
#'
# ================================================================
profileSummarize <- function(protsCombineCnew, numRefCols, numDataCols, refColsKeep=c(1,2),
eps, GroupBy="protId", outlierExclude="spectraAndPeptide", cpus=1) {
# outlierExclude:
# none: don't exclude any outlers
# spectra: (default) exclude only spectra within peptides
# spectraAndPeptide: exclude spectra-within-peptide outliers and peptide outliers
# # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # #
if (GroupBy == "protId") uniqueLabel <- protsCombineCnew$protId
if (GroupBy == "peptideId") uniqueLabel <- protsCombineCnew$pepId
#if (GroupBy == "protId") indList <- unique(protsCombineCnew$protId)
#if (GroupBy == "peptideId") indList <- unique(protsCombineCnew$pepId)
indList <- unique(uniqueLabel) # 28 Jan 2022
if (!(outlierExclude %in% c("none", "spectra", "spectraAndPeptide"))) {
cat("outlierExclude must be one of none, spectra, or peptide\n")
stop()
}
if ((GroupBy == "peptideId") & (outlierExclude == "spectraAndPeptide")) {
cat("if GroupBy == \'peptideId\' then outlierExclude cannot equal \'spectraAndPeptide\' \n")
stop()
}
#indList <- unique(uniqueLabel)
n_prots <- length(indList) # either proteins or proteins/peptides
#library(lme4)
#library(snowfall)
sfInit(parallel=TRUE, cpus=cpus)
sfExport("protsCombineCnew") # main data set
sfExport("uniqueLabel")
#sfExport("meansByProteins") # function
sfExport("GroupBy")
#sfExport("outlier")
sfExport("numDataCols")
sfExport("numRefCols")
#sfLibrary(lme4)
system.time(result <- sfLapply(indList, meansByProteins, uniqueLabel=uniqueLabel, protsCombineCnew=protsCombineCnew,
numRefCols=numRefCols, numDataCols=numDataCols,
GroupBy=GroupBy, eps=eps, outlierExclude=outlierExclude))
sfStop()
temp <- do.call(what="rbind", result) # convert list of matrices to one matrix
temp_df <- data.frame(temp)
names_channels <- names(protsCombineCnew)[numRefCols + c(seq_len(numDataCols))]
names(temp_df)[ seq_len(numDataCols)] <- names_channels
names(temp_df)[(numDataCols + 1):( numDataCols + numDataCols)] <- paste(names_channels, ".se", sep="")
names(temp_df)[ 2*numDataCols + seq_len(4)] <- c("Nspectra", "Npep", "protId", "pepId")
if (GroupBy == "protId") refColsKeep <- 2 # only keep the "prot" column
index <- match(temp_df$pepId, protsCombineCnew$pepId) # indices of second data set
protsMini <- protsCombineCnew[index, refColsKeep]
if (GroupBy == "protId") {
#temp_df <- subset(temp_df, select=-c(protId, pepId)) # drop protId and pepId (only for GroupBy="protId")
ncolTemp <- ncol(temp_df)
temp_df <- temp_df[,-c(ncolTemp-1, ncolTemp)] # remove protId and pepId (last two columns)
protProfileSummaryRes <- data.frame(protsMini, temp_df) # "protsMini" is protein names
names(protProfileSummaryRes)[1] <- "prot"
}
if (GroupBy == "peptideId") {
protProfileSummaryRes <- data.frame(protsMini,temp_df)
}
# for "Identity" (untransformed) results, drop the standard error terms, which don't apply
if (GroupBy == "protId") {
# note that first two columns are protein and peptide names
protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]
# now reverse the log2 transformation
protProfileSummaryIdentity[,2:(1 + numDataCols)] <- 2^protProfileSummaryRes[,c(2:(1 + numDataCols))] - eps
# now eliminate negative values due to roundoff error
negVals <- {{protProfileSummaryIdentity[,2:(1 + numDataCols)] < 0} & {protProfileSummaryIdentity[,2:(1 + numDataCols)] > -1E-10}}
protProfileSummaryIdentity[,2:(1 + numDataCols)][negVals] <- 0
}
if (GroupBy == "peptideId") {
nKeep <- length(refColsKeep)
# note that first column is protein name. (There is no peptide name here)
#protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]
# drop the standard errors of the mean profiles (9 columns)
protProfileSummaryIdentity <- protProfileSummaryRes[,-((nKeep+1 + numDataCols):(nKeep + 2*numDataCols))]
# now reverse the log2 transformation
protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] <- 2^protProfileSummaryRes[,(nKeep+1):(nKeep + numDataCols)] - eps
# now eliminate negative values due to roundoff error
negVals <- {{protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] < 0} & {protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] > -1E-10}}
protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)][negVals] <- 0
}
protProfileSummaryIdentity
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.