## Correspondence functions.
#' @include functions-Params.R
#' @title Core API function for peak density based chromatographic peak
#' grouping
#'
#' @description
#'
#' The `do_groupChromPeaks_density` function performs chromatographic peak
#' grouping based on the density (distribution) of peaks, found in different
#' samples, along the retention time axis in slices of overlapping mz ranges.
#'
#' @details For overlapping slices along the mz dimension, the function
#' calculates the density distribution of identified peaks along the
#' retention time axis and groups peaks from the same or different samples
#' that are close to each other. See (Smith 2006) for more details.
#'
#' @note The default settings might not be appropriate for all LC/GC-MS setups,
#' especially the `bw` and `binSize` parameter should be adjusted
#' accordingly.
#'
#' @param peaks A `matrix` or `data.frame` with the mz values and
#' retention times of the identified chromatographic peaks in all samples of an
#' experiment. Required columns are `"mz"`, `"rt"` and
#' `"sample"`. The latter should contain `numeric` values representing
#' the index of the sample in which the peak was found.
#'
#' @inheritParams groupChromPeaks-density
#'
#' @param sleep `numeric(1)` defining the time to *sleep* between
#' iterations and plot the result from the current iteration.
#'
#' @return
#'
#' A `data.frame`, each row representing a (mz-rt) feature (i.e. a peak group)
#' with columns:
#'
#' - `"mzmed"`: median of the peaks' apex mz values.
#' - `"mzmin"`: smallest mz value of all peaks' apex within the feature.
#' - `"mzmax"`:largest mz value of all peaks' apex within the feature.
#' - `"rtmed"`: the median of the peaks' retention times.
#' - `"rtmin"`: the smallest retention time of the peaks in the group.
#' - `"rtmax"`: the largest retention time of the peaks in the group.
#' - `"npeaks"`: the total number of peaks assigned to the feature.
#' - `"peakidx"`: a `list` with the indices of all peaks in a feature in the
#' `peaks` input matrix.
#'
#' Note that this number can be larger than the total number of samples, since
#' multiple peaks from the same sample could be assigned to a feature.
#'
#' @references
#'
#' Colin A. Smith, Elizabeth J. Want, Grace O'Maille, Ruben Abagyan and
#' Gary Siuzdak. "XCMS: Processing Mass Spectrometry Data for Metabolite
#' Profiling Using Nonlinear Peak Alignment, Matching, and Identification"
#' Anal. Chem. 2006, 78:779-787.
#'
#' @author Colin Smith, Johannes Rainer
#'
#' @family core peak grouping algorithms
#'
#' @md
#'
#' @examples
#' ## Load the test data set
#' library(faahKO)
#' data(faahko)
#'
#' ## Extract the matrix with the identified peaks from the xcmsSet:
#' fts <- peaks(faahko)
#'
#' ## Perform the peak grouping with default settings:
#' res <- do_groupChromPeaks_density(fts, sampleGroups = sampclass(faahko))
#'
#' ## The feature definitions:
#' head(res$featureDefinitions)
#'
#' ## The assignment of peaks from the input matrix to the features
#' head(res$peakIndex)
do_groupChromPeaks_density <- function(peaks, sampleGroups,
bw = 30, minFraction = 0.5, minSamples = 1,
binSize = 0.25, maxFeatures = 50,
sleep = 0) {
if (missing(sampleGroups))
stop("Parameter 'sampleGroups' is missing! This should be a vector of ",
"length equal to the number of samples specifying the group ",
"assignment of the samples.")
if (missing(peaks))
stop("Parameter 'peaks' is missing!")
if (!(is.matrix(peaks) | is.data.frame(peaks)))
stop("'peaks' has to be a 'matrix' or a 'data.frame'!")
## Check that we've got all required columns
.reqCols <- c("mz", "rt", "sample")
if (sleep > 0)
.reqCols <- c(.reqCols, "into")
if (!all(.reqCols %in% colnames(peaks)))
stop("Required columns ",
paste0("'", .reqCols[!.reqCols %in% colnames(peaks)],"'",
collapse = ", "), " not found in 'peaks' parameter")
sampleGroups <- as.character(sampleGroups)
sampleGroupNames <- unique(sampleGroups)
sampleGroupTable <- table(sampleGroups)
nSampleGroups <- length(sampleGroupTable)
## Check that sample groups matches with sample column.
if (max(peaks[, "sample"]) > length(sampleGroups))
stop("Sample indices in 'peaks' are larger than there are sample",
" groups specified with 'sampleGroups'!")
peaks <- cbind(peaks[, .reqCols, drop = FALSE],
index = seq_len(nrow(peaks)))
## Order peaks matrix by mz
peaks <- peaks[order(peaks[, "mz"]), , drop = FALSE]
rownames(peaks) <- NULL
rtRange <- range(peaks[, "rt"])
## Define the mass slices and the index in the peaks matrix with an mz
## value >= mass[i].
mass <- seq(peaks[1, "mz"], peaks[nrow(peaks), "mz"] + binSize,
by = binSize / 2)
masspos <- findEqualGreaterM(peaks[, "mz"], mass)
densFrom <- rtRange[1] - 3 * bw
densTo <- rtRange[2] + 3 * bw
## Increase the number of sampling points for the density distribution.
densN <- max(512, 2 * 2^(ceiling(log2(diff(rtRange) / (bw / 2)))))
endIdx <- 0
message("Processing ", length(mass) - 1, " mz slices ... ",
appendLF = FALSE)
resL <- vector("list", (length(mass) - 2))
for (i in seq_len(length(mass)-2)) {
## That's identifying overlapping mz slices.
startIdx <- masspos[i]
endIdx <- masspos[i + 2] - 1
if (endIdx - startIdx < 0)
next
resL[[i]] <- .group_peaks_density(peaks[startIdx:endIdx, , drop = FALSE],
bw = bw, densFrom = densFrom,
densTo = densTo, densN = densN,
sampleGroups = sampleGroups,
sampleGroupTable = sampleGroupTable,
minFraction = minFraction,
minSamples = minSamples,
maxFeatures = maxFeatures,
sleep = sleep)
}
message("OK")
res <- do.call(rbind, resL)
if (nrow(res)) {
## Remove groups that overlap with more "well-behaved" groups
numsamp <- rowSums(
as.matrix(res[, (match("npeaks", colnames(res)) +1):(ncol(res) -1),
drop = FALSE]))
uorder <- order(-numsamp, res[, "npeaks"])
uindex <- rectUnique(
as.matrix(res[, c("mzmin", "mzmax", "rtmin", "rtmax"),
drop = FALSE]), uorder)
res <- res[uindex, , drop = FALSE]
rownames(res) <- NULL
}
res
}
## do_groupChromPeaks_density_old <- function(peaks, sampleGroups,
## bw = 30, minFraction = 0.5,
## minSamples = 1,
## binSize = 0.25, maxFeatures = 50,
## sleep = 0) {
## if (missing(sampleGroups))
## stop("Parameter 'sampleGroups' is missing! This should be a vector of ",
## "length equal to the number of samples specifying the group ",
## "assignment of the samples.")
## if (missing(peaks))
## stop("Parameter 'peaks' is missing!")
## if (!is.matrix(peaks) | is.data.frame(peaks))
## stop("'peaks' has to be a 'matrix' or a 'data.frame'!")
## ## Check that we've got all required columns
## .reqCols <- c("mz", "rt", "sample")
## if (sleep > 0)
## .reqCols <- c(.reqCols, "into")
## if (!all(.reqCols %in% colnames(peaks)))
## stop("Required columns ",
## paste0("'", .reqCols[!.reqCols %in% colnames(peaks)],"'",
## collapse = ", "), " not found in 'peaks' parameter")
## sampleGroups <- as.character(sampleGroups)
## sampleGroupNames <- unique(sampleGroups)
## sampleGroupTable <- table(sampleGroups)
## nSampleGroups <- length(sampleGroupTable)
## ## Check that sample groups matches with sample column.
## if (max(peaks[, "sample"]) > length(sampleGroups))
## stop("Sample indices in 'peaks' are larger than there are sample",
## " groups specified with 'sampleGroups'!")
## ## Order peaks matrix by mz
## peakOrder <- order(peaks[, "mz"])
## peaks <- peaks[peakOrder, .reqCols, drop = FALSE]
## rownames(peaks) <- NULL
## rtRange <- range(peaks[, "rt"])
## ## Define the mass slices and the index in the peaks matrix with an mz
## ## value >= mass[i].
## mass <- seq(peaks[1, "mz"], peaks[nrow(peaks), "mz"] + binSize,
## by = binSize / 2)
## masspos <- findEqualGreaterM(peaks[,"mz"], mass)
## groupmat <- matrix(nrow = 512, ncol = 7 + nSampleGroups)
## groupindex <- vector("list", 512)
## densFrom <- rtRange[1] - 3 * bw
## densTo <- rtRange[2] + 3 * bw
## ## Increase the number of sampling points for the density distribution.
## densN <- max(512, 2 * 2^(ceiling(log2(diff(rtRange) / (bw / 2)))))
## endIdx <- 0
## num <- 0
## gcount <- integer(nSampleGroups)
## message("Processing ", length(mass) - 1, " mz slices ... ", appendLF = FALSE)
## for (i in seq_len(length(mass)-2)) {
## ## That's identifying overlapping mz slices.
## startIdx <- masspos[i]
## endIdx <- masspos[i + 2] - 1
## if (endIdx - startIdx < 0)
## next
## curMat <- peaks[startIdx:endIdx, , drop = FALSE]
## den <- density(curMat[, "rt"], bw = bw, from = densFrom, to = densTo,
## n = densN)
## maxden <- max(den$y)
## deny <- den$y
## ## gmat <- matrix(nrow = 5, ncol = 2 + gcount)
## snum <- 0
## ## What's that 20 there?
## while (deny[maxy <- which.max(deny)] > maxden / 20 && snum < maxFeatures) {
## grange <- descendMin(deny, maxy)
## deny[grange[1]:grange[2]] <- 0
## gidx <- which(curMat[,"rt"] >= den$x[grange[1]] &
## curMat[,"rt"] <= den$x[grange[2]])
## ## Determine the sample group of the samples in which the peaks
## ## were detected and check if they correspond to the required limits.
## tt <- table(sampleGroups[unique(curMat[gidx, "sample"])])
## if (!any(tt / sampleGroupTable[names(tt)] >= minFraction &
## tt >= minSamples))
## next
## snum <- snum + 1
## num <- num + 1
## ## Double the size of the output containers if they're full
## if (num > nrow(groupmat)) {
## groupmat <- rbind(groupmat,
## matrix(nrow = nrow(groupmat),
## ncol = ncol(groupmat)))
## groupindex <- c(groupindex, vector("list", length(groupindex)))
## }
## gcount <- rep(0, length(sampleGroupNames))
## names(gcount) <- sampleGroupNames
## gcount[names(tt)] <- as.numeric(tt)
## groupmat[num, 1] <- median(curMat[gidx, "mz"])
## groupmat[num, 2:3] <- range(curMat[gidx, "mz"])
## groupmat[num, 4] <- median(curMat[gidx, "rt"])
## groupmat[num, 5:6] <- range(curMat[gidx, "rt"])
## groupmat[num, 7] <- length(gidx)
## groupmat[num, 7 + seq(along = gcount)] <- gcount
## groupindex[[num]] <- sort(peakOrder[(startIdx:endIdx)[gidx]])
## }
## if (sleep > 0) {
## ## Plot the density
## plot(den, main = paste(round(min(curMat[,"mz"]), 2), "-",
## round(max(curMat[,"mz"]), 2)))
## ## Highlight peaks per sample group.
## for (j in 1:nSampleGroups) {
## ## Which peaks belong to this sample group.
## cur_group_samples <- which(sampleGroups == sampleGroupNames[j])
## idx <- curMat[, "sample"] %in% cur_group_samples
## points(curMat[idx, "rt"], curMat[idx, "into"] /
## max(curMat[, "into"]) * maxden,
## col = j, pch=20)
## }
## for (j in seq(length = snum))
## abline(v = groupmat[num - snum + j, 5:6], lty = "dashed", col = j)
## Sys.sleep(sleep)
## }
## }
## message("OK")
## colnames(groupmat) <- c("mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax",
## "npeaks", sampleGroupNames)
## groupmat <- groupmat[seq_len(num), , drop = FALSE]
## groupindex <- groupindex[seq_len(num)]
## ## Remove groups that overlap with more "well-behaved" groups
## numsamp <- rowSums(groupmat[, (match("npeaks",
## colnames(groupmat))+1):ncol(groupmat),
## drop = FALSE])
## uorder <- order(-numsamp, groupmat[, "npeaks"])
## uindex <- rectUnique(groupmat[, c("mzmin","mzmax","rtmin","rtmax"),
## drop = FALSE],
## uorder)
## return(list(featureDefinitions = groupmat[uindex, , drop = FALSE],
## peakIndex = groupindex[uindex]))
## }
#' @title Core API function for peak grouping using mzClust
#'
#' @description
#'
#' The `do_groupPeaks_mzClust` function performs high resolution
#' correspondence on single spectra samples.
#'
#' @inheritParams groupChromPeaks-density
#'
#' @inheritParams do_groupChromPeaks_density
#'
#' @inheritParams groupChromPeaks-mzClust
#'
#' @return A `list` with elements `"featureDefinitions"` and
#' `"peakIndex"`. `"featureDefinitions"` is a `matrix`, each row
#' representing an (mz-rt) feature (i.e. peak group) with columns:
#' - `"mzmed"`: median of the peaks' apex mz values.
#' - `"mzmin"`: smallest mz value of all peaks' apex within the feature.
#' - `"mzmax"`: largest mz value of all peaks' apex within the feature.
#' - `"rtmed"`: always `-1`.
#' - `"rtmin"`: always `-1`.
#' - `"rtmax"`: always `-1`.
#' - `"npeaks"`: the total number of peaks assigned to the feature. Note that
#' this number can be larger than the total number of samples, since
#' multiple peaks from the same sample could be assigned to a group.
#'
#' `"peakIndex"` is a `list` with the indices of all peaks in a peak group in
#' the `peaks` input matrix.
#'
#' @md
#'
#' @family core peak grouping algorithms
#'
#' @references Saira A. Kazmi, Samiran Ghosh, Dong-Guk Shin, Dennis W. Hill
#' and David F. Grant\cr \emph{Alignment of high resolution mass spectra:
#' development of a heuristic approach for metabolomics}.\cr Metabolomics,
#' Vol. 2, No. 2, 75-83 (2006)
do_groupPeaks_mzClust <- function(peaks, sampleGroups, ppm = 20,
absMz = 0, minFraction = 0.5,
minSamples = 1) {
if (missing(sampleGroups))
stop("Parameter 'sampleGroups' is missing! This should be a vector of ",
"length equal to the number of samples specifying the group ",
"assignment of the samples.")
if (missing(peaks))
stop("Parameter 'peaks' is missing!")
if (!(is.matrix(peaks) | is.data.frame(peaks)))
stop("Peaks has to be a 'matrix' or a 'data.frame'!")
## Check that we've got all required columns
.reqCols <- c("mz", "sample")
if (!all(.reqCols %in% colnames(peaks)))
stop("Required columns ",
paste0("'", .reqCols[!.reqCols %in% colnames(peaks)],"'",
collapse = ", "), " not found in 'peaks' parameter")
if (!is.factor(sampleGroups))
sampleGroups <- factor(sampleGroups, levels = unique(sampleGroups))
sampleGroupNames <- levels(sampleGroups)
sampleGroupTable <- table(sampleGroups)
nSampleGroups <- length(sampleGroupTable)
##sampleGroups <- as.numeric(sampleGroups)
## Check that sample groups matches with sample column.
if (max(peaks[, "sample"]) > length(sampleGroups))
stop("Sample indices in 'peaks' are larger than there are sample",
" groups specified with 'sampleGroups'!")
peaks <- .fix_mz_clust_peaks(peaks)
grps <- mzClustGeneric(peaks[, .reqCols, drop = FALSE],
sampclass = sampleGroups,
mzppm = ppm,
mzabs = absMz,
minsamp = minSamples,
minfrac = minFraction)
grpmat <- grps$mat
if (is.null(nrow(grpmat))) {
matColNames <- names(grpmat)
grpmat <- matrix(grpmat, ncol = length(grpmat), byrow = FALSE)
colnames(grpmat) <- matColNames
}
rts <- rep(-1, nrow(grpmat))
cns <- colnames(grpmat)
grpmat <- cbind(grpmat[, 1:3, drop = FALSE], rts, rts, rts,
grpmat[, 4:ncol(grpmat), drop = FALSE])
colnames(grpmat) <- c(cns[1:3], c("rtmed", "rtmin", "rtmax"),
cns[4:length(cns)])
return(list(featureDefinitions = grpmat, peakIndex = grps$idx))
}
#' @title Core API function for chromatic peak grouping using a nearest
#' neighbor approach
#'
#' @description
#'
#' The `do_groupChromPeaks_nearest` function groups peaks across samples by
#' creating a master peak list and assigning corresponding peaks from all
#' samples to each peak group (i.e. feature). The method is inspired by the
#' correspondence algorithm of mzMine (Katajamaa 2006).
#'
#' @inheritParams do_groupChromPeaks_density
#' @inheritParams groupChromPeaks-nearest
#'
#' @return A `list` with elements `"featureDefinitions"` and
#' `"peakIndex"`. `"featureDefinitions"` is a `matrix`, each row
#' representing an (mz-rt) feature (i.e. peak group) with columns:
#'
#' - `"mzmed"`: median of the peaks' apex mz values.
#' - `"mzmin"`: smallest mz value of all peaks' apex within the feature.
#' - `"mzmax"`:largest mz value of all peaks' apex within the feature.
#' - `"rtmed"`: the median of the peaks' retention times.
#' - `"rtmin"`: the smallest retention time of the peaks in the feature.
#' - `"rtmax"`: the largest retention time of the peaks in the feature.
#' - `"npeaks"`: the total number of peaks assigned to the feature.
#'
#' `"peakIndex"` is a `list` with the indices of all peaks in a feature in the
#' `peaks` input matrix.
#'
#' @family core peak grouping algorithms
#'
#' @md
#'
#' @references Katajamaa M, Miettinen J, Oresic M: MZmine: Toolbox for
#' processing and visualization of mass spectrometry based molecular profile
#' data. Bioinformatics 2006, 22:634-636.
do_groupChromPeaks_nearest <- function(peaks, sampleGroups, mzVsRtBalance = 10,
absMz = 0.2, absRt = 15, kNN = 10) {
if (missing(sampleGroups))
stop("Parameter 'sampleGroups' is missing! This should be a vector of ",
"length equal to the number of samples specifying the group ",
"assignment of the samples.")
if (missing(peaks))
stop("Parameter 'peaks' is missing!")
if (!(is.matrix(peaks) | is.data.frame(peaks)))
stop("Peaks has to be a 'matrix' or a 'data.frame'!")
## Check that we've got all required columns
.reqCols <- c("mz", "rt", "sample")
if (!all(.reqCols %in% colnames(peaks)))
stop("Required columns ",
paste0("'", .reqCols[!.reqCols %in% colnames(peaks)],"'",
collapse = ", "), " not found in 'peaks' parameter")
if (!is.factor(sampleGroups))
sampleGroups <- factor(sampleGroups, levels = unique(sampleGroups))
sampleGroupNames <- levels(sampleGroups)
sampleGroupTable <- table(sampleGroups)
nSampleGroups <- length(sampleGroupTable)
## sampleGroups == classlabel
## nSampleGroups == gcount
## peaks == peakmat
peaks <- peaks[, .reqCols, drop = FALSE]
parameters <- list(mzVsRTBalance = mzVsRtBalance, mzcheck = absMz,
rtcheck = absRt, knn = kNN)
ptable <- table(peaks[,"sample"])
pord <- ptable[order(ptable, decreasing = TRUE)]
sid <- as.numeric(names(pord))
pn <- as.numeric(pord)
## environment - probably not a good idea for parallel processing - we
## would like to have data copying there (or better just provide the data
## chunk it needs to process).
mplenv <- new.env(parent = .GlobalEnv)
mplenv$mplist <- matrix(0, pn[1], length(sid))
mplenv$mplist[, sid[1]] <- which(peaks[,"sample"] == sid[1])
mplenv$mplistmean <- data.frame(peaks[which(peaks[,"sample"] == sid[1]),
c("mz", "rt")])
mplenv$peakmat <- peaks
assign("peakmat", peaks, envir = mplenv) ## double assignment?
sapply(sid[2:length(sid)], function(sample, mplenv){
message("Processing sample number ", sample, " ... ", appendLF = FALSE)
## require(parallel)
## cl <- makeCluster(getOption("cl.cores", nSlaves))
## clusterEvalQ(cl, library(RANN))
## parSapply(cl, 2:length(samples), function(sample,mplenv, object){
## might slightly improve on this for loop.
## Calculating for each row (peak) the mean mz or rt for peaks
## assigned yet to this peak group.
for (mml in seq(mplenv$mplist[,1])) {
mplenv$mplistmean[mml, "mz"] <-
mean(mplenv$peakmat[mplenv$mplist[mml, ], "mz"])
mplenv$mplistmean[mml, "rt"] <-
mean(mplenv$peakmat[mplenv$mplist[mml, ], "rt"])
}
mplenv$peakIdxList <- data.frame(
peakidx = which(mplenv$peakmat[, "sample"] == sample),
isJoinedPeak = FALSE
)
if (length(mplenv$peakIdxList$peakidx) == 0)
message("Warning: No peaks in sample number ", sample)
## this really doesn't take a long time not worth parallel version here.
## but make an apply loop now faster even with rearranging the data :D : PB
scoreList <- sapply(mplenv$peakIdxList$peakidx,
function(currPeak, para, mplenv){
xcms:::patternVsRowScore(currPeak, para, mplenv)
}, parameters, mplenv, simplify = FALSE)
scoreList <- do.call(rbind, scoreList)
## Browse scores in order of descending goodness-of-fit
scoreListcurr <- scoreList[order(scoreList[, "score"]), ]
if (nrow(scoreListcurr) > 0) {
for (scoreIter in 1:nrow(scoreListcurr)) {
iterPeak <- scoreListcurr[scoreIter, "peak"]
iterRow <- scoreListcurr[scoreIter, "mpListRow"]
## Check if master list row is already assigned with peak
if (scoreListcurr[scoreIter, "isJoinedRow"] == TRUE)
next
## Check if peak is already assigned to some master list row
if (scoreListcurr[scoreIter, "isJoinedPeak"] == TRUE)
next
## Check if score good enough
## Assign peak to master peak list row
mplenv$mplist[iterRow, sample] <- iterPeak
## Mark peak as joined
setTrue <- which(scoreListcurr[, "mpListRow"] == iterRow)
scoreListcurr[setTrue, "isJoinedRow"] <- TRUE
setTrue <- which(scoreListcurr[, "peak"] == iterPeak)
scoreListcurr[setTrue, "isJoinedPeak"] <- TRUE
mplenv$peakIdxList[which(mplenv$peakIdxList$peakidx == iterPeak),
"isJoinedPeak"] <- TRUE
}
}
notJoinedPeaks <- mplenv$peakIdxList[which(mplenv$peakIdxList$isJoinedPeak == FALSE), "peakidx"]
for (notJoinedPeak in notJoinedPeaks) {
mplenv$mplist <- rbind(mplenv$mplist,
matrix(0, 1, dim(mplenv$mplist)[2]))
mplenv$mplist[length(mplenv$mplist[,1]), sample] <- notJoinedPeak
}
## Clear "Joined" information from all master peaklist rows
rm(list = "peakIdxList", envir = mplenv)
message("OK")
}, mplenv)
groupmat <- matrix( 0, nrow(mplenv$mplist), 7 + nSampleGroups)
colnames(groupmat) <- c("mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax",
"npeaks", sampleGroupNames)
groupindex <- vector("list", nrow(mplenv$mplist))
## Variable to count samples for a peak
sampCounts <- rep_len(0, nSampleGroups)
names(sampCounts) <- sampleGroupNames
## gcount <- integer(nSampleGroups)
## Can we vectorize that below somehow?
for (i in 1:nrow(mplenv$mplist)) {
groupmat[i, "mzmed"] <- median(peaks[mplenv$mplist[i, ], "mz"])
groupmat[i, c("mzmin", "mzmax")] <- range(peaks[mplenv$mplist[i, ], "mz"])
groupmat[i, "rtmed"] <- median(peaks[mplenv$mplist[i, ], "rt"])
groupmat[i, c("rtmin", "rtmax")] <- range(peaks[mplenv$mplist[i, ], "rt"])
groupmat[i, "npeaks"] <- length(which(peaks[mplenv$mplist[i, ]] > 0))
## Now summarizing the number of samples in which the peak was identified
sampCounts[] <- 0
tbl <- table(sampleGroups[peaks[mplenv$mplist[i, ], "sample"]])
sampCounts[names(tbl)] <- as.numeric(tbl)
groupmat[i, 7 + seq_len(nSampleGroups)] <- sampCounts
## gnum <- sampleGroups[unique(peaks[mplenv$mplist[i, ], "sample"])]
## for (j in seq(along = gcount))
## gcount[j] <- sum(gnum == j)
## groupmat[i, 7 + seq(along = gcount)] <- gcount
groupindex[[i]] <- mplenv$mplist[i, (which(mplenv$mplist[i,]>0))]
}
return(list(featureDefinitions = groupmat, peakIndex = groupindex))
}
#' Low level function to group chromatographic peaks within a m/z slice.
#'
#' @param x `matrix` such as the one returned by `chromPeaks,XCMSnExp`, just
#' with the peaks within one m/z slice. Note that we require in addition
#' a column `"index"` with the index of the peak within the full peak table.
#'
#' @param return `data.frame`
#'
#' @author Johannes Rainer
#'
#' @noRd
.group_peaks_density <- function(x, bw, densFrom, densTo, densN, sampleGroups,
sampleGroupTable, minFraction,
minSamples, maxFeatures, sleep = 0) {
den <- density(x[, "rt"], bw = bw, from = densFrom, to = densTo,
n = densN)
maxden <- max(den$y)
deny <- den$y
sampleGroupNames <- names(sampleGroupTable)
nSampleGroups <- length(sampleGroupNames)
col_nms <- c("mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax",
"npeaks", sampleGroupNames)
res_mat <- matrix(nrow = 0, ncol = length(col_nms),
dimnames = list(character(), col_nms))
res_idx <- list()
while (deny[maxy <- which.max(deny)] > maxden / 20 && nrow(res_mat) <
maxFeatures) {
grange <- descendMin(deny, maxy)
deny[grange[1]:grange[2]] <- 0
gidx <- which(x[,"rt"] >= den$x[grange[1]] &
x[,"rt"] <= den$x[grange[2]])
## Determine the sample group of the samples in which the peaks
## were detected and check if they correspond to the required limits.
tt <- table(sampleGroups[unique(x[gidx, "sample"])])
if (!any(tt / sampleGroupTable[names(tt)] >= minFraction &
tt >= minSamples))
next
gcount <- rep(0, length(sampleGroupNames))
names(gcount) <- sampleGroupNames
gcount[names(tt)] <- as.numeric(tt)
res_mat <- rbind(res_mat,
c(median(x[gidx, "mz"]),
range(x[gidx, "mz"]),
median(x[gidx, "rt"]),
range(x[gidx, "rt"]),
length(gidx),
gcount)
)
res_idx <- c(res_idx, list(unname(sort(x[gidx, "index"]))))
}
if (sleep > 0) {
## Plot the density
plot(den, main = paste(round(min(x[,"mz"]), 2), "-",
round(max(x[,"mz"]), 2)))
## Highlight peaks per sample group.
for (j in seq_len(nSampleGroups)) {
## Which peaks belong to this sample group.
cur_group_samples <- which(sampleGroups == sampleGroupNames[j])
idx <- x[, "sample"] %in% cur_group_samples
points(x[idx, "rt"], x[idx, "into"] /
max(x[, "into"]) * maxden,
col = j, pch=20)
}
for (j in seq_len(nrow(res_mat)))
abline(v = res_mat[j, 5:6], lty = "dashed", col = j)
Sys.sleep(sleep)
}
res <- as.data.frame(res_mat)
res$peakidx <- res_idx
res
}
#' @description
#'
#' Check the input peaks table eventually replacing `NA` values in column `"mz"`
#' with the mean of columns `"mzmin"` and `"mzmax"` (if present).
#' This fixes issue #416.
#'
#' @param x peaks `matrix`.
#'
#' @return peaks `matrix`
#'
#' @noRd
#'
#' @md
.fix_mz_clust_peaks <- function(x) {
## Issue #416: fix for peaks with an m/z of NA.
nas <- is.na(x[, "mz"])
if (any(nas)) {
## if we have mzmin and mzmax use mean of them.
if (all(c("mzmin", "mzmax") %in% colnames(x)) &&
!any(is.na(x[nas, c("mzmin", "mzmax")]))) {
warning("Got ", sum(nas), " peaks with missing values in column ",
"'mz'. Replaced them with the mean of values in columns ",
"'mzmin' and 'mzmax' values.")
x[nas, "mz"] <- rowMeans(x[nas, c("mzmin", "mzmax")])
} else {
stop("Got ", sum(nas), " peaks with missing values in column 'mz'.")
}
}
x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.