#' @include hidden_aliases.R
NULL
.valid_processing_queue <- function(x) {
if (length(x) && !all(vapply1l(x, inherits, "ProcessingStep")))
return("'processingQueue' should only contain ProcessingStep objects.")
NULL
}
#' @description
#'
#' Apply the lazy evaluation processing queue to the peaks (i.e. m/z, intensity
#' matrix) and return the result.
#'
#' @param x `list` of peak `matrix`.
#'
#' @param msLevel `integer` with the MS level of each spectrum. Length has to
#' match `length(x)`.
#'
#' @param centroided `logical` whether spectra are centroided. Length has to
#' match `length(x)`.
#'
#' @param queue `list` of `ProcessStep` elements.
#'
#' @param spectraData `data.frame` with additional spectra variables as
#' requested with `addProcessing`.
#'
#' @return `list` of peak `matrix` elements.
#'
#' @author Johannes Rainer
#'
#' @importFrom ProtGenerics executeProcessingStep
#'
#' @noRd
.apply_processing_queue <- function(x, spectraData = data.frame(),
queue = NULL) {
x <- unname(x)
if (length(queue)) {
args <- list()
ls <- length(spectraData)
if (ls)
colnames(spectraData) <- sub("^msLevel$", "spectrumMsLevel",
colnames(spectraData))
for (i in seq_along(x)) {
if (ls)
args <- as.list(spectraData[i, ])
for (pStep in queue) {
x[[i]] <- do.call(pStep@FUN, args = c(x[i], args, pStep@ARGS))
}
}
}
x
}
.processingQueueVariables <- function(x) {
if (.hasSlot(x, "processingQueueVariables"))
x@processingQueueVariables
else c("msLevel", "centroided")
}
#' @title Apply arbitrary functions and processing queue to peaks matrices
#'
#' @description
#'
#' This function applies the processing queue and an arbitrary function to
#' the peaks matrix of each spectrum of the `Spectra` object `object`.
#'
#' @param object `Spectra` object.
#'
#' @param FUN optional function to be applied to the peaks matrix. The peaks
#' matrix will be passed to the first argument of the function. The function
#' should also have arguments `...`.
#'
#' @param spectraVariables `character` defining spectra variables that should
#' be passed to the function.
#'
#' @param ... optional additional arguments to `FUN`.
#'
#' @param f `factor` or `vector` that can be coerced to one defining how the
#' data should be split for parallel processing.
#'
#' @param BPPARAM parallel processing setup.
#'
#' @return `list` of `matrix` with the peaks.
#'
#' @author Johannes Rainer
#'
#' @importMethodsFrom BiocParallel bplapply
#'
#' @noRd
.peaksapply <- function(object, FUN = NULL, ...,
spectraVariables = .processingQueueVariables(object),
f = dataStorage(object), BPPARAM = bpparam()) {
len <- length(object)
if (!len)
return(list())
if (length(f) != len)
stop("Length of 'f' has to match 'length(object)' (", len, ")")
if (!is.factor(f))
f <- factor(f)
pqueue <- object@processingQueue
if (!is.null(FUN))
pqueue <- c(pqueue, ProcessingStep(FUN, ARGS = list(...)))
if (length(levels(f)) > 1 || length(pqueue)) {
res <- bplapply(split(object@backend, f), function(z, queue, svars) {
if (length(svars))
spd <- as.data.frame(spectraData(z, columns = svars))
else spd <- NULL
.apply_processing_queue(peaksData(z), spd, queue = queue)
}, queue = pqueue, svars = spectraVariables, BPPARAM = BPPARAM)
unsplit(res, f = f, drop = TRUE)
} else peaksData(object@backend)
}
#' @title Apply a function to subsets of Spectra
#'
#' @description
#'
#' Applying a function to a Spectra eventually splitting it into chunks for
#' parallel/serial processing. By default `x` is splitted by `dataStorage`
#' applying the function `FUN` to each chunk. This has the advantage of
#' possibly running calculations in parallel **and** requiring, depending on
#' the backend, less memory (e.g. in case of an on-disk backend only the m/z
#' and intensity values for the current chunk has to be loaded into memory).
#'
#' While being partially redundant to the `.peaksapply` function, this function
#' allows the function `FUN` to get access to ALL spectra variables.
#'
#' Note that results are returned as generated by `split`, i.e. in this order
#' and as a `list` of values. Eventually use `unlist` later.
#'
#' @param x `Spectra` object.
#'
#' @param FUN `function` to be applied to each spectrum.
#'
#' @param f factor to split `x` into chunks for parallel processing. Defaults
#' to splitting `x` by `dataStorage`.
#'
#' @param ... Additional parameters for `FUN`.
#'
#' @param BPPARAM Optional settings for `BiocParallel`-based parallel
#' processing. See [bpparam()] for more informations and options.
#'
#' @return A `list` with the result of `FUN`.
#'
#' @author Johannes Rainer
#'
#' @importFrom BiocParallel SerialParam
#'
#' @noRd
.lapply <- function(x, FUN, f = as.factor(dataStorage(x)), ...,
BPPARAM = SerialParam()) {
bplapply(split(x, f), FUN = FUN, ..., BPPARAM = BPPARAM)
}
#' @export applyProcessing
#'
#' @rdname Spectra
applyProcessing <- function(object, f = dataStorage(object),
BPPARAM = bpparam(), ...) {
if (!length(object@processingQueue))
return(object)
if (isReadOnly(object@backend))
stop(class(object@backend), " is read-only. 'applyProcessing' works ",
"only with backends that support writing data.")
if (!is.factor(f))
f <- factor(f, levels = unique(f))
if (length(f) != length(object))
stop("length 'f' has to be equal to the length of 'object' (",
length(object), ")")
bknds <- bplapply(split(object@backend, f = f), function(z, queue, svars) {
if (length(svars))
spd <- as.data.frame(spectraData(z, columns = svars))
else spd <- NULL
peaksData(z) <- .apply_processing_queue(peaksData(z), spd, queue)
z
}, queue = object@processingQueue,
svars = .processingQueueVariables(object), BPPARAM = BPPARAM)
bknds <- backendMerge(bknds)
if (is.unsorted(f))
bknds <- bknds[order(unlist(split(seq_along(bknds), f),
use.names = FALSE))]
object@backend <- bknds
object@processing <- .logging(object@processing,
"Applied processing queue with ",
length(object@processingQueue),
" steps")
object@processingQueue <- list()
if (!.hasSlot(object, "processingQueueVariables"))
object <- updateObject(object, check = FALSE)
object@processingQueueVariables <- character()
object
}
#' @description
#'
#' Simple helper function to test parameter msLevel. Returns `TRUE` if parameter
#' is OK, `FALSE` if a warning is thrown and throws an error if it is not
#' a numeric.
#'
#' @noRd
.check_ms_level <- function(object, msLevel) {
if (!length(object))
return(TRUE)
if (!is.numeric(msLevel))
stop("'msLevel' must be numeric")
if (!any(msLevel(object) %in% msLevel)) {
warning("Specified MS levels ", paste0(msLevel, collapse = ","),
" not available in 'object'")
FALSE
} else TRUE
}
#' @title Spectrum comparison
#'
#' @description
#'
#' Compare each spectrum in `x` with each spectrum in `y` with function `FUN`.
#' Mapping between the peaks in both spectra can be defined with the `MAPFUN`
#' function. This function *realizes* the peak matrices in chunks to balance
#' between performance (high `chunkSize`) and low memory demand
#' (small `chunkSize`).
#'
#' @note
#'
#' Results might be slightly different if `x` and `y` are switched, because of
#' the matching of the peaks by their m/z between spectra. Matching is performed
#' by applying a `tolerance` and `ppm` to the second spectrum (i.e. from `y`)
#' and, especially for `ppm`, the maximal accepted difference depend thus on
#' the m/z values from the second spectrum.
#'
#' @param x [Spectra()]
#'
#' @param y [Spectra()]
#'
#' @param FUN `function` to compare the (m/z matched) intensities from each
#' spectrum in `x` with those in `y`.
#'
#' @param MAPFUN `function` to map peaks between the two compared spectra. See
#' [joinPeaks()].
#'
#' @param tolerance `numeric(1)` allowing to define a constant maximal accepted
#' difference between m/z values for peaks to be matched.
#'
#' @param ppm `numeric(1)` allowing to define a relative, m/z-dependent,
#' maximal accepted difference between m/z values for peaks to be matched.
#'
#' @param chunkSize `integer(1)` defining the number of peak matrices that
#' should be realized (i.e. loaded into memory) in each iteration. Large
#' `chunkSize` will result in faster processing, small `chunkSize` in
#' lower memory demand. Defaults to `chunkSize = 10000`.
#'
#' @param ... additional parameters passed to `FUN` and `MAPFUN`.
#'
#' @return
#'
#' `matrix` with number of rows equal to length of `x` and number of columns
#' equal `length(y)`.
#'
#' @importFrom stats cor
#'
#' @examples
#'
#' library(Spectra)
#' fl <- system.file("TripleTOF-SWATH/PestMix1_SWATH.mzML", package = "msdata")
#' sps <- Spectra(fl, source = MsBackendMzR())
#'
#' sps <- sps[1:10]
#' res <- .compare_spectra_chunk(sps, sps, ppm = 20)
#'
#' res <- .compare_spectra_chunk(sps[1], sps, FUN = correlateSpectra)
#' res <- .compare_spectra_chunk(sps, sps[1], FUN = correlateSpectra)
#'
#' @noRd
.compare_spectra_chunk <- function(x, y = NULL, MAPFUN = joinPeaks,
tolerance = 0, ppm = 20,
FUN = ndotproduct,
chunkSize = 10000, ...) {
nx <- length(x)
ny <- length(y)
if (nx <= chunkSize && ny <= chunkSize) {
mat <- .peaks_compare(.peaksapply(x), .peaksapply(y), MAPFUN = MAPFUN,
tolerance = tolerance, ppm = ppm,
FUN = FUN, xPrecursorMz = precursorMz(x),
yPrecursorMz = precursorMz(y), ...)
dimnames(mat) <- list(spectraNames(x), spectraNames(y))
return(mat)
}
x_idx <- seq_len(nx)
x_chunks <- split(x_idx, ceiling(x_idx / chunkSize))
rm(x_idx)
y_idx <- seq_len(ny)
y_chunks <- split(y_idx, ceiling(y_idx / chunkSize))
rm(y_idx)
mat <- matrix(NA_real_, nrow = nx, ncol = ny,
dimnames = list(spectraNames(x), spectraNames(y)))
for (x_chunk in x_chunks) {
x_buff <- x[x_chunk]
x_peaks <- .peaksapply(x_buff)
for (y_chunk in y_chunks) {
y_buff <- y[y_chunk]
mat[x_chunk, y_chunk] <- .peaks_compare(
x_peaks, .peaksapply(y_buff),
MAPFUN = MAPFUN, tolerance = tolerance, ppm = ppm,
FUN = FUN, xPrecursorMz = precursorMz(x_buff),
yPrecursorMz = precursorMz(y_buff), ...)
}
}
mat
}
#' @description
#'
#' Compare each spectrum in `x` with each other spectrum in `x`. Makes use of
#' `combn` to avoid calculating combinations twice.
#'
#' @inheritParams .compare_spectra
#'
#' @importFrom utils combn
#'
#' @importFrom MsCoreUtils vapply1d
#' @examples
#'
#' res <- .compare_spectra(sps, sps)
#' res_2 <- .compare_spectra_self(sps)
#' all.equal(res, res_2) # comparison x[1], x[2] != x[2], x[1]
#' all.equal(res[!lower.tri(res)], res_2[!lower.tri(res_2)])
#'
#' @noRd
.compare_spectra_self <- function(x, MAPFUN = joinPeaks, tolerance = 0,
ppm = 20, FUN = ndotproduct, ...) {
nx <- length(x)
m <- matrix(NA_real_, nrow = nx, ncol = nx,
dimnames = list(spectraNames(x), spectraNames(x)))
cb <- which(lower.tri(m, diag = TRUE), arr.ind = TRUE)
pmz <- precursorMz(x)
for (i in seq_len(nrow(cb))) {
cur <- cb[i, 2L]
if (i == 1L || cb[i - 1L, 2L] != cur) {
py <- px <- peaksData(x[cur])[[1L]]
pmzx <- pmzy <- pmz[cur]
} else {
py <- peaksData(x[cb[i, 1L]])[[1L]]
pmzy <- pmz[cb[i, 1L]]
}
map <- MAPFUN(px, py, tolerance = tolerance, ppm = ppm,
xPrecursorMz = pmzx, yPrecursorMz = pmzy,
.check = FALSE,...)
m[cb[i, 1L], cur] <- m[cur, cb[i, 1L]] <-
FUN(map[[1L]], map[[2L]], xPrecursorMz = pmzx,
yPrecursorMz = pmzy, ...)
}
m
}
#' @description
#'
#' Combine a `Spectra` of length `n` to a `Spectra` of length `1`.
#' Takes all *spectra variables* from the first spectrum and combines the
#' peaks into a single peaks matrix.
#'
#' @note
#'
#' If the backend of the input `Spectra` is *read-only* we're first converting
#' that to a `MsBackendDataFrame` to support replacing peak data.
#'
#' @param x `Spectra`, ideally from a single `$dataStorage`.
#'
#' @param f `factor` to group spectra in `x`. It is supposed that input checks
#' have been performed beforehand.
#'
#' @param FUN `function` to be applied to the list of peak matrices.
#'
#' @return `Spectra` of length 1. If the input `Spectra` uses a read-only
#' backend that is changed to a `MsBackendDataFrame`.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @examples
#'
#' spd <- DataFrame(msLevel = c(2L, 2L, 2L), rtime = c(1, 2, 3))
#' spd$mz <- list(c(12, 14, 45, 56), c(14.1, 34, 56.1), c(12.1, 14.15, 34.1))
#' spd$intensity <- list(c(10, 20, 30, 40), c(11, 21, 31), c(12, 22, 32))
#'
#' sps <- Spectra(spd)
#'
#' res <- .combine_spectra(sps)
#' res$mz
#' res$intensity
#'
#' res <- .combine_spectra(sps, FUN = combinePeaks, tolerance = 0.1)
#' res$mz
#' res$intensity
.combine_spectra <- function(x, f = x$dataStorage, FUN = combinePeaks, ...) {
if (!is.factor(f))
f <- factor(f)
else f <- droplevels(f)
x_new <- x[match(levels(f), f)]
if (isReadOnly(x_new@backend))
x_new <- setBackend(x_new, MsBackendDataFrame())
peaksData(x_new@backend) <- lapply(
split(.peaksapply(x, BPPARAM = SerialParam()), f = f), FUN = FUN, ...)
x_new@processingQueue <- list()
validObject(x_new)
x_new
}
#' Utility to concatenate a `list` of `Spectra` objects into a single `Spectra`.
#' This function is called in `c`.
#'
#' @param x `list` of `Spectra` objects.
#'
#' @return `Spectra`.
#'
#' @author Johannes Rainer
#'
#' @noRd
.concatenate_spectra <- function(x) {
cls <- vapply1c(x, class)
if (any(cls != "Spectra"))
stop("Can only concatenate 'Spectra' objects")
pqs <- lapply(x, function(z) z@processingQueue)
## For now we stop if there is any of the processingQueues not empty. Later
## we could even test if they are similar, and if so, merge.
if (any(lengths(pqs)))
stop("Can not concatenate 'Spectra' objects with non-empty ",
"processing queue. Consider calling 'applyProcessing' before.")
metad <- do.call(c, lapply(x, function(z) z@metadata))
procs <- unique(unlist(lapply(x, function(z) z@processing)))
object <- new(
"Spectra", metadata = metad,
backend = backendMerge(lapply(x, function(z) z@backend)),
processing = c(procs, paste0("Merge ", length(x),
" Spectra into one [", date(), "]"))
)
validObject(object)
object
}
#' @export concatenateSpectra
#'
#' @rdname Spectra
concatenateSpectra <- function(x, ...) {
.concatenate_spectra(unlist(unname(list(unname(x), ...))))
}
#' @export combineSpectra
#'
#' @rdname Spectra
combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage,
FUN = combinePeaks, ..., BPPARAM = bpparam()) {
if (!is.factor(f))
f <- factor(f, levels = unique(f))
if (!is.factor(p))
p <- factor(p, levels = unique(p))
if (length(f) != length(x) || length(p) != length(x))
stop("length of 'f' and 'p' have to match length of 'x'")
if (isReadOnly(x@backend))
message("Backend of the input object is read-only, will change that",
" to an 'MsBackendDataFrame'")
if (nlevels(p) > 1) {
## We split the workload by storage file. This ensures memory efficiency
## for file-based backends.
res <- bpmapply(FUN = .combine_spectra, split(x, p), split(f, p),
MoreArgs = list(FUN = FUN, ...), BPPARAM = BPPARAM)
.concatenate_spectra(res)
} else
.combine_spectra(x, f = f, FUN = FUN, ...)
}
#' @description
#'
#' Internal function to check if any (or all) of the provided `mz` values are
#' in the spectras' m/z.
#'
#' @param x `Spectra` object
#'
#' @param mz `numeric` of m/z value(s) to check in each spectrum of `x`.
#'
#' @param tolarance `numeric(1)` with the tolerance.
#'
#' @param ppm `numeric(1)` with the ppm.
#'
#' @param condFun `function` such as `any` or `all`.
#'
#' @param parallel `BiocParallel` parameter object.
#'
#' @return `logical` same length than `x`.
#'
#' @author Johannes Rainer
#'
#' @importFrom MsCoreUtils common
#'
#' @noRd
.has_mz <- function(x, mz = numeric(), tolerance = 0, ppm = 20, condFun = any,
parallel = SerialParam()) {
mzs <- mz(x, BPPARAM = parallel)
vapply(mzs, FUN = function(z)
condFun(common(mz, z, tolerance = tolerance, ppm = ppm)), logical(1))
}
#' @description
#'
#' Same as `.has_mz` only that a different `mz` is used for each spectrum in
#' `x`. Length of `mz` is thus expected to be equal to length of `x`.
#'
#' @param mz `numeric` **same length as `x`**.
#'
#' @author Johannes Rainer
#'
#' @noRd
.has_mz_each <- function(x, mz = numeric(), tolerance = 0, ppm = 20,
parallel = SerialParam()) {
mzs <- mz(x, BPPARAM = parallel)
mapply(mzs, mz, FUN = function(z, y) {
if (is.na(y))
NA # if m/z is NA is better to return NA instead of FALSE
else common(y, z, tolerance = tolerance, ppm = ppm)
})
}
#' @export joinSpectraData
#'
#' @rdname Spectra
joinSpectraData <- function(x, y,
by.x = "spectrumId",
by.y,
suffix.y = ".y") {
stopifnot(inherits(x, "Spectra"))
stopifnot(inherits(y, "DataFrame"))
if (missing(by.y))
by.y <- by.x
x_vars <- spectraVariables(x)
y_vars <- names(y)
if (length(by.x) != 1 | length(by.y) != 1)
stop("'by.x' and 'by.y' must be of length 1.")
if (!is.character(by.x) | !is.character(by.y))
stop("'by.x' and 'by.y' must be characters.")
if (any(duplicated(x_vars)))
stop("Duplicated spectra variables in 'x'.")
if (any(duplicated(y_vars)))
stop("Duplicated names in 'y'.")
if (!by.x %in% x_vars)
stop("'by.x' not found in spectra variables.")
if (!by.y %in% y_vars)
stop("'by.y' not found.")
## Keep only rows that (1) have a non-NA by.y and (2) that are in
## x[[by.x]] (given that we do a left join here).
y <- y[!is.na(y[[by.y]]), ]
y <- y[y[[by.y]] %in% spectraData(x)[[by.x]], ]
k <- match(y[[by.y]], spectraData(x)[[by.x]])
n <- length(x)
## Don't need by.y anymore
y_vars <- y_vars[-grep(by.y, y_vars)]
y <- y[, y_vars]
## Check if there are any shared x_vars and y_vars. If so, the
## y_vars are appended suffix.y.
if (length(xy_vars <- intersect(x_vars, y_vars))) {
y_vars[y_vars %in% xy_vars] <- paste0(y_vars[y_vars %in% xy_vars], suffix.y[1])
names(y) <- y_vars
}
for (y_var in y_vars) {
## Initialise the new column as a list, vector or List (in
## that order!)
if (is.list(y[[y_var]])) {
x[[y_var]] <- vector("list", length = n)
} else if (is.vector(y[[y_var]])) {
x[[y_var]] <- rep(NA, n)
} else if (inherits(y[[y_var]], "List")) {
x[[y_var]] <- as(vector("list", length = n), class(y[[y_var]]))
}
## Populate the new column
x[[y_var]][k] <- y[[y_var]]
}
x
}
#' @export
#'
#' @rdname Spectra
processingLog <- function(x) {
x@processing
}
#' @export
#'
#' @rdname Spectra
estimatePrecursorIntensity <- function(x, ppm = 10, tolerance = 0,
method = c("previous", "interpolation"),
msLevel. = 2L, f = dataOrigin(x),
BPPARAM = bpparam()) {
if (is.factor(f))
f <- as.character(f)
f <- factor(f, levels = unique(f))
unlist(bplapply(split(x, f), FUN = .estimate_precursor_intensity, ppm = ppm,
tolerance = tolerance, method = method, msLevel = msLevel.,
BPPARAM = BPPARAM), use.names = FALSE)
}
#' estimate precursor intensities based on MS1 peak intensity. This function
#' assumes that `x` is a `Spectra` with data **from a single file/sample**.
#'
#' @author Johannes Rainer
#'
#' @importFrom stats approx
#'
#' @noRd
.estimate_precursor_intensity <- function(x, ppm = 10, tolerance = 0,
method = c("previous",
"interpolation"),
msLevel = 2L) {
method <- match.arg(method)
if (any(msLevel > 2))
warning("Estimation of precursor intensities for MS levels > 2 is ",
"not yet validated")
if (!length(x))
return(numeric())
if (is.unsorted(rtime(x))) {
idx <- order(rtime(x))
x <- x[idx]
} else idx <- integer()
pmz <- precursorMz(x)
pmi <- rep(NA_real_, length(pmz))
idx_ms2 <- which(!is.na(pmz) & msLevel(x) == msLevel)
x_ms1 <- filterMsLevel(x, msLevel = msLevel - 1L)
pks <- .peaksapply(x_ms1)
for (i in idx_ms2) {
rt_i <- rtime(x[i])
before_idx <- which(rtime(x_ms1) < rt_i)
before_int <- NA_real_
before_rt <- NA_real_
if (length(before_idx)) {
before_idx <- before_idx[length(before_idx)]
before_rt <- rtime(x_ms1)[before_idx]
peaks <- pks[[before_idx]]
peak_idx <- closest(pmz[i], peaks[, "mz"],
tolerance = tolerance,
ppm = ppm, duplicates = "closest",
.check = FALSE)
before_int <- unname(peaks[peak_idx, "intensity"])
}
if (method == "interpolation") {
after_idx <- which(rtime(x_ms1) > rt_i)
after_int <- NA_real_
after_rt <- NA_real_
if (length(after_idx)) {
after_idx <- after_idx[1L]
after_rt <- rtime(x_ms1)[after_idx]
peaks <- pks[[after_idx]]
peak_idx <- closest(pmz[i], peaks[, "mz"],
tolerance = tolerance,
ppm = ppm, duplicates = "closest",
.check = FALSE)
after_int <- unname(peaks[peak_idx, "intensity"])
}
if (is.na(before_int)) {
before_int <- after_int
} else {
if (!is.na(after_int))
before_int <- approx(c(before_rt, after_rt),
c(before_int, after_int),
xout = rt_i)$y
}
}
pmi[i] <- before_int
}
if (length(idx))
pmi <- pmi[order(idx)]
pmi
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.