Nothing
#' group MS/MS precursor masses
#'
#' @description hierarchically cluster ms/ms precursor scans within and across
#' samples, according to a m/z and retention time error.
#' @return a list identical to adductSpectra containing an additional list
#' element:
#' @param adductSpectra AdductSpec object
#' @param nCores numeric the number of cores to use for parallel computation.
#' The default is to use all available cores detected using the function
#' parallel::detectCores()
#' @param maxRtDrift numeric for the maximum rentention time
#' drift to be considered. Default is 20.
#' @param ms1mzError numeric maximum MS1 mass:charge error
#' @param ms2mzError numeric maximum MS2 mass:charge error
#' @param dotProdClust logical remove previous dot prod clustering results
#' @param minDotProd numeric. Minimum mean dot product spectral similarity
#' score to keep a spectrum within an MS/MS group (default = 0.8).
#' @param disMetric metric to use for distance in clustering
#' @param fclustMethod method to use for the fclust function
#' @param compSpecGen logical for whether composite spectra generation
#' is necessary
#' @usage ms2Group(adductSpectra = NULL, nCores = NULL, maxRtDrift = NULL,
#' ms1mzError = 0.1, ms2mzError = 1, dotProdClust = TRUE, minDotProd = 0.8,
#' fclustMethod = "median", disMetric = "euclidean", compSpecGen = TRUE,
#' adjPrecursorMZ = TRUE)
#' @param adjPrecursorMZ logical for precursor mass:charge adjustment
#'
ms2Group <-
function(adductSpectra = NULL,
nCores = NULL,
maxRtDrift = NULL,
ms1mzError = 0.1,
ms2mzError = 1,
dotProdClust = TRUE,
minDotProd = 0.8,
fclustMethod = "median",
disMetric = "euclidean",
compSpecGen = TRUE,
adjPrecursorMZ = TRUE) {
# error handling
if (is.null(adductSpectra)) {
stop("argument adductSpectra is missing with no default")
} else if (!is(adductSpectra, 'AdductSpec')) {
stop("argument adductSpectra is not an AdductSpec class object.")
}
# check if the parameters are the same and stop if re-grouping
# unneccessary
if (ncol(Parameters(adductSpectra)) > 0) {
if (all(c("maxRtDrift", "ms1mzError", "dotProdClust") %in%
colnames(Parameters(adductSpectra)))) {
if (!is.null(maxRtDrift)) {
if (Parameters(adductSpectra)$maxRtDrift ==
maxRtDrift &&
Parameters(adductSpectra)$ms1mzError ==
ms1mzError &&
Parameters(adductSpectra)[, 'dotProdClust'] ==
FALSE)
{
stop(
"Grouping and composite spectrum generation is
unnecessary as maxRtDrift and ms1mzError
arguments are un-changed from
the previous round of grouping...\n"
)
}
}
}
}
# empty peptide identification results slots
specPepMatches(adductSpectra) <- vector("list", 0)
sumAdductType(adductSpectra) <- data.frame()
Peptides(adductSpectra) <- data.frame()
metaDataTmp <- metaData(adductSpectra)
metaDataTmp$orderTmp <- seq_len(nrow(metaDataTmp))
indxTmp <- metaDataTmp$aboveMinPeaks == 1
metaDataTmp <- metaDataTmp[indxTmp,]
# all spectra 1 list
allSpectra <-
unlist(adductMS2spec(adductSpectra), recursive =
FALSE)
# check if mz error has changed if not then do not re-do
# hierarchical clustering of mass
hclustMass <- TRUE
if (!is.null(Parameters(adductSpectra)$ms1mzError)) {
hclustMass <- ifelse(Parameters(adductSpectra)$ms1mzError ==
ms1mzError,
FALSE,
TRUE)
}
if (hclustMass == TRUE) {
message(
"Hierarchically clustering ",
prettyNum(nrow(metaDataTmp),
big.mark = ","),
" MS/MS precursors and cutting dendrogram according
to a mass error of ",
ms1mzError,
"...\n"
)
# if necessary adjust the mass drift
if ({
"adjPrecursorMZ" %in% colnames(metaDataTmp)
} == FALSE) {
adjPrecursorMZ <- FALSE
}
if (adjPrecursorMZ == TRUE) {
# add ppm mass drift value
precursorMZs <- metaDataTmp$precursorMZ
} else {
precursorMZs <- metaDataTmp$precursorMZ
}
hr <- fastcluster::hclust.vector(precursorMZs,
metric = disMetric,
method = fclustMethod)
# m/z error
metaDataTmp$msPrecursor_group <- 0
metaDataTmp$msPrecursor_group <-
cutree(hr, h = ms1mzError)
}
# order by msPrecursor group
orderMsPreC <- order(metaDataTmp$msPrecursor_group)
metaDataTmp <- metaDataTmp[orderMsPreC,]
allSpectra <- allSpectra[orderMsPreC]
# adjust RT if necessary adjust with intStd
if (!is.null(metaDataTmp$intStdRtDrift)) {
retentionTime <- as.numeric(metaDataTmp$retentionTime) +
(as.numeric(metaDataTmp$intStdRtDrift) *
-1)
} else {
retentionTime <- as.numeric(metaDataTmp$retentionTime)
}
if (length(rtDevModels(adductSpectra)) > 0) {
message("Adjusting retention time based on loess model...")
metaDataTmp$predRtLoess <- 0
for (x in seq_along(Specfile.paths(adductSpectra))) {
indxFileTmp <- which(metaDataTmp$mzXMLFile %in%
basename(Specfile.paths(
adductSpectra))[x])
metaDataTmp$predRtLoess[indxFileTmp] <-
retentionTime[indxFileTmp] -
(predict(
rtDevModels(adductSpectra)[[x]],
newdata =
as.numeric(retentionTime[indxFileTmp]) / 60
) * 60)
}
retentionTime <- as.numeric(metaDataTmp$predRtLoess)
}
# if unneccessary do not re do retention time clustering
hclustRt <- TRUE
if (!is.null(Parameters(adductSpectra)$maxRtDrift)) {
if (!is.null(maxRtDrift)) {
hclustRt <- ifelse(Parameters(adductSpectra)[
, 'maxRtDrift'] == maxRtDrift,
FALSE,
TRUE)
} else {
hclustRt <- FALSE
}
}
if (hclustRt) {
# matching ms/ms scans across samples
message(
"clustering ms/ms scans across samples based on a
maximum RT drift of ",
maxRtDrift,
".\n"
)
interMSMSrtGroups <- tapply(retentionTime,
metaDataTmp$msPrecursor_group,
function(x) {
if (length(x) > 1) {
cutree(
fastcluster::hclust.vector(x, metric =
disMetric,
method = fclustMethod),
h = maxRtDrift
)
} else {
1
}
})
# interMSMS groups
metaDataTmp$interMSMSrtGroups <-
paste0(metaDataTmp$msPrecursor_group,
"_",
unlist(interMSMSrtGroups))
message(
length(unique(metaDataTmp$interMSMSrtGroups)),
" MS/MS peak groups identified at maxRtDrift of ",
maxRtDrift,
" seconds"
)
}
if (dotProdClust == TRUE)
{
# spectral similarity all ms/ms groups # order by
#adduct MS2 file
#names
spectrumIds <- paste0(metaDataTmp$mzXMLFile,
".MS2spectra",
".",
metaDataTmp$seqNum)
allSpectra <- allSpectra[spectrumIds]
# remove previous dot prod clustering results
metaDataTmp$interMSMSrtGroups <-
gsub("\\..+", "",
metaDataTmp$interMSMSrtGroups)
# intra-MSMS group spectral similarity (dot product)
# clustering
message(
"intra-MS/MS group (n=",
length(unique(metaDataTmp$interMSMSrtGroups)),
" RT + m/z groups) spectral similarity
(dot product) clustering...\n"
)
specByGroup <- split(allSpectra,
metaDataTmp$interMSMSrtGroups)
spectraList <- NULL
groupName <- NULL
dotProdSimClust <-
function(spectraList = NULL,
groupName = NULL,
minDotProdThresh = minDotProd,
binSizeMS2 = 1) {
# if length 1 return nothing
clusterId <- groupName
if (length(spectraList) > 1) {
specNamesVecTmp <- rep(names(spectraList),
vapply(spectraList,
nrow, FUN.VALUE = numeric(1)))
allSpecTmp <-
do.call(rbind, spectraList)
maxMass <-
floor(max(allSpecTmp[, 1])) + 10
# padded integer labels
message("Calculating dot product matrix ",
length(unique(specNamesVecTmp)),
" spectra\n")
# padded integer labels
labelsTmp <-
paste0(
"(",
seq(binSizeMS2,
(maxMass - binSizeMS2),
binSizeMS2),
",",
seq((2 * binSizeMS2),
maxMass, binSizeMS2),
"]"
)
massBinsIndivTmp <-
cut(
allSpecTmp[, 1],
breaks = seq(binSizeMS2,
maxMass, binSizeMS2),
labels = labelsTmp
)
# empty bins
indivSpecVec <-
tapply(allSpecTmp[, 2],
paste0(specNamesVecTmp,
massBinsIndivTmp),
sum)
# identify any absent bins
allBinNames <-
paste0(rep(unique(specNamesVecTmp),
each = length(labelsTmp)),
rep(labelsTmp, length(
unique(specNamesVecTmp)
)))
# add absent bins as zeros
allBinsTmp <-
rep(0, length(allBinNames))
names(allBinsTmp) <- allBinNames
# ensure indivSpecVec is in right order
allBinsTmp[match(names(indivSpecVec),
allBinNames)] <-
indivSpecVec
indivSpecMat <-
matrix(allBinsTmp,
byrow =
FALSE,
nrow = length(labelsTmp))
# mean all pairwise dotproducts
dotProdMat <-
crossprod(indivSpecMat)
sqrtMatrixTmp <-
matrix(
sqrt(colSums(indivSpecMat ^ 2)),
nrow = nrow(dotProdMat),
ncol = ncol(dotProdMat),
byrow = TRUE
)
dotProdsTmp <-
dotProdMat / (sqrtMatrixTmp *
diag(sqrtMatrixTmp))
hr <-
fastcluster::hclust(as.dist(1 -
dotProdsTmp),
method = "single")
clusterId <- cutree(hr, h = {
1 - minDotProdThresh
})
clusterId <-
paste0(groupName, ".",
clusterId)
}
return(clusterId)
} # end dot product hierarchical clustering
nCores <- ifelse(is.null(nCores), 1, nCores)
if (nCores > 1) {
# multi-threaded
pmt <- proc.time()
message(paste0(
"Starting SNOW cluster with ",
nCores,
" local sockets...\n"
))
cl <- parallel::makeCluster(nCores)
doSNOW::registerDoSNOW(cl)
dotProdResults <-
foreach(
spectraList =
specByGroup,
groupName = names(specByGroup),
.combine = "c",
.packages = c("fastcluster")
) %dopar% {
dotProdSimClust(spectraList, groupName,
binSizeMS2 =
ms2mzError)
}
# stop SNOW cluster
parallel::stopCluster(cl)
proc.time() - pmt
# sort dot prod results to reinsert in correct
# order
indxSpecTmp <-
match(do.call(c, lapply(specByGroup, names)),
spectrumIds)
dotProdResults[indxSpecTmp] <-
dotProdResults
} else {
dotProdResults <- vector("character",
nrow(metaDataTmp))
for (i in seq_along(specByGroup)) {
message(i,
" of ",
length(specByGroup),
" m/z + RT groups.\n")
dotProdResTmp <-
dotProdSimClust(
spectraList =
specByGroup[[i]],
groupName = names(specByGroup)[i],
binSizeMS2 = ms2mzError
)
indxSpecTmp <- match(names(specByGroup[[i]]), spectrumIds)
dotProdResults[indxSpecTmp] <-
dotProdResTmp
}
}
# intra-MSMS group spectral similarity (dot
# product) clustering
metaDataTmp$interMSMSrtGroups <-
dotProdResults
message(
prettyNum(length(
unique(metaDataTmp$interMSMSrtGroups)
) - 1,
big.mark = ","),
" inter-MS/MS groups
following spectral
similarity (dot product) clustering...\n"
)
}
if (compSpecGen == TRUE)
{
# order by adduct MS2 file names
spectrumIds <-
paste0(metaDataTmp$mzXMLFile,
".MS2spectra",
".",
metaDataTmp$seqNum)
allSpectra <-
allSpectra[spectrumIds]
compSpecTable <- table(metaDataTmp$interMSMSrtGroups)
message("Generating ",
sum(compSpecTable > 1),
" composite spectra...\n")
# if nCores not null then parallel comp
if (!is.null(nCores)) {
# all single spectra groups first
interMSMScompSpec <-
vector("list",
length(unique(
metaDataTmp$interMSMSrtGroups
)))
names(interMSMScompSpec) <-
unique(metaDataTmp$interMSMSrtGroups)
singleGroupIndx <- match(names(compSpecTable)[
compSpecTable == 1],
metaDataTmp$interMSMSrtGroups)
listIndx <- match(metaDataTmp$interMSMSrtGroups[
singleGroupIndx],names(interMSMScompSpec))
interMSMScompSpec[listIndx] <-
allSpectra[paste0(metaDataTmp$mzXMLFile[singleGroupIndx],
".MS2spectra",
".",
metaDataTmp$seqNum[singleGroupIndx])]
multSampNames <-
names(compSpecTable)[compSpecTable > 1]
# mult sample index
multSampIndx <- match(names(interMSMScompSpec), multSampNames)
message(paste0(
"Starting SNOW cluster
with ",
nCores,
" local sockets...\n"
))
cl <-
parallel::makeCluster(nCores)
doSNOW::registerDoSNOW(cl)
compSpecTmp <-
foreach(j = multSampNames,
.packages = c("fastcluster",
"adductomics")) %dopar% {
subSpecNamesTmp <- spectrumIds[metaDataTmp$interMSMSrtGroups
%in% j]
spec.df <-
do.call(rbind, allSpectra[subSpecNamesTmp])
return(signalGrouping(spec.df,
ms2mzError, minPeaks = 0))
}
# stop SNOW cluster
parallel::stopCluster(cl)
interMSMScompSpec[which(!is.na(multSampIndx))] <-
compSpecTmp[multSampIndx[!is.na(multSampIndx)]]
} else {
pmt <- proc.time()
interMSMScompSpec <-
vector("list",
length(unique(
metaDataTmp$interMSMSrtGroups
)))
names(interMSMScompSpec) <-
unique(metaDataTmp$interMSMSrtGroups)
singleGroupIndx <- match(names(compSpecTable)[compSpecTable == 1],
metaDataTmp$interMSMSrtGroups)
listIndx <- match(metaDataTmp$interMSMSrtGroups[singleGroupIndx],
names(interMSMScompSpec))
interMSMScompSpec[listIndx] <-
allSpectra[paste0(metaDataTmp$mzXMLFile[singleGroupIndx],
".MS2spectra",
".",
metaDataTmp$seqNum[singleGroupIndx])]
compSpecIndx <-
names(compSpecTable)[compSpecTable > 1]
pb <- txtProgressBar(
min = 0,
max = length(compSpecIndx),
style = 3
)
for (j in seq_along(compSpecIndx)) {
setTxtProgressBar(pb, j)
intGroup <- compSpecIndx[j]
listIndxTmp <- which(names(interMSMScompSpec) == intGroup)
subSpecNamesTmp <-
spectrumIds[metaDataTmp$interMSMSrtGroups
%in% intGroup]
spec.df <-
do.call(rbind, allSpectra[subSpecNamesTmp])
interMSMScompSpec[[listIndxTmp]] <-
signalGrouping(spec.df,
ms2mzError,
minPeaks = 0)
}
proc.time() - pmt
} # nCores
# add inter sample spectra to adductspectra
groupMS2spec(adductSpectra) <-
lapply(interMSMScompSpec,
function(subL) {
colnames(subL) <- c("mass", "intensity")
return(data.frame(subL, stringsAsFactors =
FALSE))
})
}
metaDataTmp <- metaDataTmp[order(metaDataTmp$orderTmp),]
metaDataTmp$orderTmp <- NULL
# add mod metaData back to AdductSpec object
# add additional columns if necc
missCol <-
setdiff(colnames(metaDataTmp),
colnames(metaData(adductSpectra)))
metaData(adductSpectra)[, missCol] <-
""
nameIndx <-
match(colnames(metaDataTmp),
colnames(metaData(adductSpectra)))
metaData(adductSpectra) <-
metaData(adductSpectra)[, nameIndx]
metaData(adductSpectra)[indxTmp,] <-
metaDataTmp
# add parameters to parameters slot
argsTmp <-
c(
"maxRtDrift",
"ms1mzError",
"ms2mzError",
"dotProdClust",
"minDotProd",
"fclustMethod",
"disMetric"
)
paramTmp <-
data.frame(
ifelse(!is.null(maxRtDrift), maxRtDrift, "NULL"),
ms1mzError,
ms2mzError,
dotProdClust,
minDotProd,
fclustMethod,
disMetric,
stringsAsFactors = FALSE
)
colnames(paramTmp) <- argsTmp
if (ncol(Parameters(adductSpectra))==0) {
Parameters(adductSpectra) <- paramTmp
} else {
Parameters(adductSpectra)[, argsTmp] <-
paramTmp
}
return(adductSpectra)
} # end function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.