# service function for outlierFind to identify outliers in a matrix
# using both scoresMod and boxplotMod
#
# @param i protein i
# @param protClass a matrix of protein, peptide identifiers and normalized
# specific amounts
# @param outlierLevel peptide for outlier spectra within peptides, or
# protein for outlier peptides within proteins`
# @param numRefCols number of columns before Mass Spectrometry data columns
# @param numDataCols how many columns in MS data
# @param outlierMeth boxplot (recommended), scores, or none
# @param range the range parameter used for identifying outliers
# @param proba probability to exclude outler for scores method
# @param eps value to add before log2 transfromations (to avoid taking log of zero)
#' @importFrom graphics boxplot
#' @importFrom outliers scores
#' @importFrom stats runif
#' @import snow
# @param randomError TRUE if allow it to be random
# @return indicators of outlier peptides or outlier spectra for a protein or peptide
# @export
outlierFind_i <- function(i, protClass, outlierLevel="peptide",
numRefCols, numDataCols, outlierMeth,
range, proba, eps=eps, randomError) {
# i=9
# ================================================================
# boxplotMod to identify outliers in a vector
# Outliers that are 3 times the interquartile range from either
# quartile; note that the default for boxplots is 1.5 times the
# interquartile range
#
# @param x: a vector of numbers
# @param reject.vec.i: pre-specified rejection locations
# @param range: pre-specified times of IQR range
#
#================================================================
boxplotMod <- function(x, range = range) {
#x[reject.vec] <- NA
outlier.x <- boxplot(x, plot = FALSE, range = range)$out
outlier.ind <- (x %in% outlier.x)
return(outlier.ind*1)
}
#================================================================
# scoresMod to identify outliers in a vector
# Outliers as exceeding the normal probability of proba
#
# @param x: a vector of numbers
# @param reject.vec.i: pre-specified rejection locations
# @param proba: pre-specified normal probability
#
#================================================================
scoresMod <- function(x, reject.vec = NULL, proba = proba) {
x[reject.vec] <- NA
non.na.indices <- (seq_len(length(x)))[!is.na(x)]
ind <- scores(x[!is.na(x)], prob = proba)
ind[is.na(ind)] <- FALSE
outlier.ind <- rep(FALSE, length(x))
outlier.ind[non.na.indices] <- ind
return(outlier.ind*1)
}
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
protClass.i <- protClass[uniqueLabel == i,]
nProt <- nrow(protClass.i) #(or number of peptides)
nczfMatLog.i <- log2(protClass.i[,(numRefCols+1):(numRefCols+numDataCols)] + eps)
if (nProt > 1) {
proteinsAllData <- nczfMatLog.i
n.spectra <- nrow(proteinsAllData)
outIndMat.i <- data.frame(matrix(NA, ncol=numDataCols, nrow=n.spectra))
#out.boxplotMod.i <- outIndMat.i
names(outIndMat.i) <- names(nczfMatLog.i)
for (j in seq_len(numDataCols)) {
# j=1
xx <- as.numeric(proteinsAllData[,j])
pp <- 2^xx - eps # convert back to original scale
# add uniform (0, eps) random number, if "randomError=TRUE"
if (randomError==TRUE) {
n.pp <- length(pp)
eps.random <- runif(n=n.pp, min=0, max=eps)*(pp < 10^(-5))
pp.r <- pp + eps.random
xx <- log2(pp.r + eps)
}
# identify outliers that are 3 times the interquartile range from either quartile
# note that the default for boxplots is 1.5 times the interquartile range
if (outlierMeth=="boxplot") outIndMat.i[,j] <- boxplotMod(xx, range=range)
if (outlierMeth=="scores") outIndMat.i[,j] <- scoresMod(xx, proba=proba)
if (outlierMeth=="none") outIndMat.i[,j] <- 0*length(xx)
}
outlier.num <- apply(outIndMat.i, 1, sum)
}
if (nProt == 1) {
outlier.num <- 0
}
if (outlierLevel=="peptide") outlier.num.spectra <- outlier.num
if (outlierLevel=="protein") outlier.num.peptides <- outlier.num
if (nProt >= 1) {
if (outlierLevel=="peptide") result <- cbind(protClass.i[,seq_len(numRefCols)],outlier.num.spectra,
protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 2)]) # include protId and pepId
if (outlierLevel=="protein") result <- cbind(protClass.i[,seq_len(numRefCols)],outlier.num.peptides,
protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 4)]) # include protId and pepId
}
if (nProt == 0) result <- NULL
names(result)[1] <- "prot"
return(result)
}
# ================================================================
#' outlierFind to identify outliers in a matrix
#' using both scoresMod and boxplotMod
#'
#' @param protClass a matrix of protein, peptide identifiers and normalized
#' specific amounts
#' @param outlierLevel peptide for outlier spectra within peptides, or
#' protein for outlier peptides within proteins`
#' @param numRefCols number of columns before Mass Spectrometry data columns
#' @param numDataCols how many columns in MS data
#' @param outlierMeth boxplot (recommended), scores, or none
#' @param range the range parameter used for identifying outliers
#' @param proba probability to exclude outler for scores method
#' @param eps value to add before log2 transfromations (to avoid taking log of zero)
#' @param cpus number of cpus to use for parallel processing
#' @param randomError T if allow it to be random
#' @return additional column of indicators of outlier peptides or outlier
#' spectra for a set of proteins or peptides
#' @examples
#' set.seed(17356)
#' eps <- 0.029885209
#' data(TLN1_test)
#' flagSpectraBox <- outlierFind(protClass=TLN1_test,
#' outlierLevel="peptide", numRefCols=5, numDataCols=9,
#' outlierMeth="boxplot", range=3, eps=eps,
#' cpus=1, randomError=TRUE)
#' # examine numbers of spectra that are outliers
#' table(flagSpectraBox$outlier.num.spectra)
#' @importFrom snowfall sfInit
#' @importFrom snowfall sfLibrary
#' @importFrom snowfall sfExport
#' @importFrom snowfall sfLapply
#' @importFrom snowfall sfStop
#' @export outlierFind
outlierFind <- function(protClass, outlierLevel="peptide",
numRefCols=5, numDataCols=9, outlierMeth="boxplot", range=3,
proba=0.99, eps=eps, cpus=4, randomError=TRUE) {
if (cpus > 1) {
#library(snowfall)
#sfInit(parallel=TRUE, cpus=10)
sfInit(parallel=TRUE, cpus=cpus)
#sfLibrary(snowfall)
#sfLibrary("protsummarize")
#sfExport("boxplotMod")
#sfExport("protsCombineC")
#sfExport("protsMatLabels")
#sfExport("uniqueLabel")
#sfExport("scoresMod")
sfExport("numDataCols")
sfExport("numRefCols")
sfExport("eps")
#sfLibrary(outliers)
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
indList <- unique(uniqueLabel)
system.time(result <- sfLapply(indList, outlierFind_i, protClass=protClass,
outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
outlierMeth=outlierMeth, range=range, proba=proba,
eps=eps, randomError=randomError)) # but this works fine!
sfStop()
}
if (cpus==1) {
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
indList <- unique(uniqueLabel)
system.time(result <- lapply(indList, outlierFind_i, protClass=protClass,
outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
outlierMeth=outlierMeth, range=range, proba=proba,
eps=eps, randomError=randomError))
}
#browser()
protsCombineCnew <- do.call(what="rbind", result) # convert list of matrices to one matrix
protsCombineCnew
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.