#' Find outlier spectra within a protein or pedtide i.
#'
#' Service function for outlierFind (see help function for outlierFind).
#'
#' @param i unique identifier for protein i
#' @param protClass a data frame containing profiles associated
#' with either spectra or peptides (see Tutorial 6)
#' @param outlierLevel 'peptide' for outlier spectra within peptides, or
#' 'protein' for outlier peptides within proteins
#' @param numRefCols number of columns (variables) before data columns
#' @param numDataCols number of fractions in each profile
#' @param outlierMeth boxplot (recommended), scores, or
#' none (if no outliers are to be reported)
#' @param range the range parameter used for identifying outliers
#' using the boxplot method
#' @param proba probability to exclude
#' outlier for scores method
#' @param eps small value to add so that log argument is greater than zero
#' @param randomError TRUE if allow it to be random
#' @return New data frame for a single protein or peptide with an
#' additional column that indicates the number of fractions in
#' a profile (peptides or spectra) that are outliers
#' @examples
#' set.seed(63561)
#' eps <- 0.029885209
#' data(spectraNSA_test)
#' uniqueLabel <- spectraNSA_test$pepId[1]
#' flagSpectraBox_i <- outlierFind_i(i=uniqueLabel, protClass=spectraNSA_test,
#' outlierLevel='peptide', numRefCols=5, numDataCols=9,
#' outlierMeth='boxplot', range=3, proba=NA, eps=eps,
#' randomError=TRUE)
#' str(flagSpectraBox_i)
#'
#' @importFrom graphics boxplot
#' @importFrom outliers scores
#' @importFrom stats runif
#' @export
#'
# test
outlierFind_i <- function(i, protClass, outlierLevel = "peptide",
numRefCols, numDataCols, outlierMeth, range, proba,
eps = eps, randomError) {
# i=9
# ================================================================
# boxplotMod to identify outliers in a vector
# Outliers that are 3 times the interquartile
# range from either quartile; note that the
# default for boxplots is 1.5 times the
# interquartile range @param x: a vector of
# numbers @param reject.vec.i: pre-specified
# rejection locations @param range:
# pre-specified times of IQR range
# ================================================================
boxplotMod <- function(x, range = range) {
# x[reject.vec] <- NA
outlier.x <- boxplot(x, plot = FALSE, range = range)$out
outlier.ind <- (x %in% outlier.x)
return(outlier.ind * 1)
}
# ================================================================
# scoresMod to identify outliers in a vector
# Outliers as exceeding the normal
# probability of proba @param x: a vector of
# numbers @param reject.vec.i: pre-specified
# rejection locations @param proba:
# pre-specified normal probability
# ================================================================
scoresMod <- function(x, reject.vec = NULL, proba = proba) {
x[reject.vec] <- NA
non.na.indices <- (seq_len(length(x)))[!is.na(x)]
ind <- scores(x[!is.na(x)], prob = proba)
ind[is.na(ind)] <- FALSE
outlier.ind <- rep(FALSE, length(x))
outlier.ind[non.na.indices] <- ind
return(outlier.ind * 1)
}
if (outlierLevel == "peptide")
uniqueLabel <- protClass$pepId
if (outlierLevel == "protein")
uniqueLabel <- protClass$protId
protClass.i <- protClass[uniqueLabel == i, ]
nProt <- nrow(protClass.i) #(or number of peptides)
nczfMatLog.i <- log2(protClass.i[, (numRefCols +
1):(numRefCols + numDataCols)] + eps)
if (nProt > 1) {
proteinsAllData <- nczfMatLog.i
n.spectra <- nrow(proteinsAllData)
outIndMat.i <- data.frame(matrix(NA, ncol = numDataCols,
nrow = n.spectra))
names(outIndMat.i) <- names(nczfMatLog.i)
for (j in seq_len(numDataCols)) {
# j=1
xx <- as.numeric(proteinsAllData[, j])
pp <- 2^xx - eps # convert back to original scale
# add uniform (0, eps) random number,
# if 'randomError=TRUE'
if (randomError == TRUE) {
n.pp <- length(pp)
eps.random <- runif(n = n.pp, min = 0,
max = eps) * (pp < 10^(-5))
pp.r <- pp + eps.random
xx <- log2(pp.r + eps)
}
# identify outliers that are 3 times
# the interquartile range from either
# quartile note that the default for
# boxplots is 1.5 times the
# interquartile range
if (outlierMeth == "boxplot")
outIndMat.i[, j] <- boxplotMod(xx,
range = range)
if (outlierMeth == "scores")
outIndMat.i[, j] <- scoresMod(xx, proba = proba)
if (outlierMeth == "none")
outIndMat.i[, j] <- 0 * length(xx)
}
outlier.num <- apply(outIndMat.i, 1, sum)
}
if (nProt == 1) {
outlier.num <- 0
}
if (outlierLevel == "peptide")
outlier.num.spectra <- outlier.num
if (outlierLevel == "protein")
outlier.num.peptides <- outlier.num
if (nProt >= 1) {
if (outlierLevel == "peptide")
result <- cbind(protClass.i[, seq_len(numRefCols)],
outlier.num.spectra, protClass.i[,
(numRefCols + 1):(numRefCols + numDataCols +
2)]) # include protId and pepId
if (outlierLevel == "protein")
result <- cbind(protClass.i[, seq_len(numRefCols)],
outlier.num.peptides, protClass.i[,
(numRefCols + 1):(numRefCols + numDataCols +
4)]) # include protId and pepId
}
if (nProt == 0)
result <- NULL
names(result)[1] <- "prot"
return(result)
}
# ================================================================
#' Identify outlier profiles
#'
#' Identify outlier profiles. This can be done at the level of
#' identifying outlier spectra when calculating peptide profiles or
#' identifying outlier peptides when calculating protein profiles.
#' See Tutorial 6 for a description of outlier determination methods.
#'
#' @param protClass a data frame containing profiles associated
#' with either spectra or peptides (see Tutorial 6)
#' @param outlierLevel 'peptide' for outlier spectra within peptides, or
#' 'protein' for outlier peptides within proteins
#' @param numRefCols number of columns (variables) before data columns
#' @param numDataCols number of fractions in each profile
#' @param outlierMeth boxplot (recommended), scores, or
#' none (if no outliers are to be reported)
#' @param range the range parameter used for identifying outliers
#' using the boxplot method
#' @param proba probability to exclude outlier for scores method
#' @param eps small value to add so that log argument is greater than zero
#' @param randomError TRUE if allow it to be random
#' @param setSeed seed for random number generator (deprecated)
#' @param set.seed seed for random number generator (deprecated)
#' @param cpus NULL (default); deprecated
#' Use BiocParallel with SnowParm or other
#' multiprocessor method to set number of processors
#' See examples for how to specify the number of processors
#' @param multiprocess FALSE by default
#' @return New data frame with an additional column that indicates
#' the number of fractions in a profile (spectra or peptide)
#' that are outliers
#' @examples
#' set.seed(17356) # this works if multiprocess=FALSE
#' eps <- 0.029885209
#' data(spectraNSA_test)
#' flagSpectraBox <- outlierFind(protClass=spectraNSA_test,
#' outlierLevel='peptide', numRefCols=5,
#' numDataCols=9,
#' outlierMeth='boxplot', range=3, eps=eps,
#' randomError=TRUE, multiprocess=FALSE)
#'
#' # examine breakdown of spectral according to the number of fractions
#' # in their profiles that are outliers
#' table(flagSpectraBox$outlier.num.spectra)
#' # Now use multiple processors by specifying "multiprocess=TRUE";
#' # The actual number of cpus is defined by "workers" in SnowParam
#' # A random number seed may be specified by "RNGseed" in SnowParam
#' snowParam <- BiocParallel::SnowParam(workers = 2, RNGseed=1423)
#' #
#' # now modifiy the existing BiocParallelParam
#' BiocParallel::register(snowParam, default=FALSE)
#' flagSpectraBoxM <- outlierFind(protClass=spectraNSA_test,
#' outlierLevel='peptide', numRefCols=5,
#' numDataCols=9,
#' outlierMeth='boxplot', range=3, eps=eps,
#' randomError=TRUE, multiprocess=TRUE)
#' table(flagSpectraBoxM$outlier.num.spectra)
#'
#' @importFrom BiocParallel bplapply
#' @importFrom BiocParallel SnowParam
#' @export outlierFind
outlierFind <- function(protClass, outlierLevel = "peptide",
numRefCols = 5, numDataCols = 9, outlierMeth = "boxplot",
range = 3, proba = 0.99, eps = eps, randomError = TRUE,
setSeed=NULL, set.seed=NULL, cpus=NULL, multiprocess=FALSE) {
if (!is.null(setSeed)) message("setSeed is deprecated")
if (!is.null(set.seed)) message("set.seed is deprecated")
if (!is.null(cpus)) message("cpus is deprecated; use multiprocess")
if (outlierLevel == "peptide")
uniqueLabel <- protClass$pepId
if (outlierLevel == "protein")
uniqueLabel <- protClass$protId
indList <- unique(uniqueLabel)
# if (cpus > 1) {
if (multiprocess) {
result <- bplapply(indList, outlierFind_i,
protClass = protClass, outlierLevel = outlierLevel,
numRefCols = numRefCols, numDataCols = numDataCols,
outlierMeth = outlierMeth, range = range,
proba = proba, eps = eps, randomError = randomError,
BPPARAM = BiocParallel::bpparam())
}
# if (cpus == 1) {
if (!multiprocess) {
result <- lapply(indList, outlierFind_i,
protClass = protClass, outlierLevel = outlierLevel,
numRefCols = numRefCols, numDataCols = numDataCols,
outlierMeth = outlierMeth, range = range,
proba = proba, eps = eps, randomError = randomError)
}
protsCombineCnew <- do.call(what = "rbind", result)
# convert list of matrices to one matrix
protsCombineCnew
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.