R/functions-OnDiskMSnExp.R

Defines functions .on_disk_split_by_file isCentroidedFromFile .extractMultipleChromatograms .applyFun2IndividualSpectraOfFile .applyFun2SpectraOfFileMulti precursorValue_OnDiskMSnExp validateOnDiskMSnExp validateFeatureDataForOnDiskMSnExp subsetFeatureDataBy

Documented in isCentroidedFromFile validateOnDiskMSnExp

############################################################
## subsetFeatureDataBy
##
## Convenience function to subset a OnDiskMSnExp featureData based on
## provided subsetting criteria:
## o index: subset by numeric index, logical or character. Character
##    indices are "forwarded" to argument "name". index refers to the
##    rows in the feature data.
## o scanIdx: a numeric is expected, specifying the scan index. If a
##    character vector is provided, it is "forwarded" to
##    argument "name".
## o scanIdxCol: the column of the featureData containing the scan indices.
## o name: a character vector, matching is performed using the row names
##    of the fd data.frame.
## o rtlim: a numeric of length 2 specifying the retention time window
##    from which spectra should be extracted.
## o msLevel: a numeric of MS level.
## Open questions:
## 1) Should we re-calculate the spIdx or the seqNum if we're subsetting?
## Note and comments:
## o At present filtering by multiple criteria (e.g. index and name) is not
##    possible.
subsetFeatureDataBy <- function(fd, index = NULL, scanIdx = NULL,
                                scanIdxCol = "acquisitionNum", name = NULL,
                                rtlim = NULL, msLevel = NULL) {
    valMsg <- validateFeatureDataForOnDiskMSnExp(fd)
    if (is.character(valMsg))
        stop(valMsg)
    ## o index
    if (length(index) > 0) {
        if (is.logical(index)) {
            if (length(index) != nrow(fd))
                stop("If 'index' is a logical vector its length has to match",
                     " the number of",
                     " rows of the featureData!")
            index <- which(index)
        }
        if (is.numeric(index)) {
            gotIt <- index %in% 1:nrow(fd)
            if (!any(gotIt))
                stop("Provided indices are outside of the allowed range.")
            if (any(!gotIt))
                warning("Some of the provided indices are outside of the",
                        " allowed range.")
            index <- index[gotIt]
            return(fd[index, , drop = FALSE])
        }
        if (is.character(index))
            name <- index
    }
    ## o scanIdx
    if (length(scanIdx) > 0) {
        if(is.numeric(scanIdx)){
            gotIt <- scanIdx %in% fd[, scanIdxCol]
            if(!any(gotIt))
                stop("None of the provided scan indices are available!")
            if(!all(gotIt))
                warning("Some of the provided scan indices are not available.")
            return(fd[which(fd[, scanIdxCol] %in% scanIdx), , drop=FALSE])
        }
        if (is.character(scanIdx))
            name <- scanIdx
    }
    ## o name: subset by name, match to rownames.
    if (length(name) > 0) {
        gotIt <- name %in% rownames(fd)
        if (!any(gotIt))
            stop("None of the provided names found.")
        if (!all(gotIt))
            warning("Some of the provided names do not match featureData rownames.")
        name <- name[gotIt]
        return(fd[name, , drop=FALSE])
    }
    ## o rtlim: subset by retention time range.
    if (length(rtlim > 0)){
        if (length(rtlim) > 2 | !is.numeric(rtlim))
            stop("Argument 'rtlim' has to be a numeric vector of length 2 specifying",
                 " the retention time window (range).")
        gotIt <- which(fd$retentionTime >= rtlim[1] & fd$retentionTime <= rtlim[2])
        if (length(gotIt) == 0)
            stop("No spectrum within the specified retention time window.")
        fd <- fd[gotIt, , drop=FALSE]
    }
    ## o msLevel
    fd <- fd[fd$msLevel %in% msLevel, , drop = FALSE]
    if (nrow(fd) == 0)
        warning("No spectra with the specified MS level present.")
    return(fd)
}

## Columns absolutely required in the object' featureData (issue #283).
.MSnExpReqFvarLabels <- c("fileIdx", "spIdx", "acquisitionNum",
                          "retentionTime", "msLevel", "precursorScanNum")

## Returns either NULL or a character string.
validateFeatureDataForOnDiskMSnExp <- function(x) {
    ## Testing if we've got all the required columns! See issue 105
    ## for a discussion about originalTotIonCurrent and
    ## originalPeaksCount.
    NotPresent <- .MSnExpReqFvarLabels[!(.MSnExpReqFvarLabels %in% colnames(x))]
    if (length(NotPresent) > 0)
        return(paste0("Required columns: ",
                      paste(sQuote(NotPresent), collapse = ","),
                      " not present in featureData!"))
    return(NULL)
}

############################################################
## validateOnDiskMSnExp
##
## The validation method that might be called manually. In addition to the
## validate function called by validObject this ensures also that all
## spectra objects are valid and thus re-reads the raw data.
## o mzTolerance: allowed tolerance in comparison of spectras' M/Z ranges
##    with featureData's lowMZ and highMZ.
validateOnDiskMSnExp <- function(object, mzTolerance=1e-6) {
    ## First call the basic validity.
    validObject(object)
    ## Now check validity of the spectra; if one non-valid object is found we stop.
    tmp <- spectrapply(object, FUN = function(z) {
        res <- validObject(z)
        if (is(res, "character"))
            stop(res)
    })
    ## Spectra M/Z range: spectra M/Z ranges should be within lowMZ and highMZ.
    ## Get the mzrange by spectrum.
    res <- spectrapply(object, FUN = function(x) {
        return(range(mz(x), na.rm=TRUE))
    })
    ## Loop through the spectra and check that mzrange is within
    ## lowMZ and highMZ
    fd <- fData(object)
    emptyMz <- 0
    for (i in seq_len(length(object))) {
        ## Check if lowMZ or highMZ are 0; in that case skip and throw
        ## a warning, see https://github.com/lgatto/MSnbase/issues/153
        if (fd$lowMZ[i] == 0L & fd$highMZ[i] == 0L) {
            emptyMz <- emptyMz + 1
        } else {
            hgeq <- isTRUE(base::all.equal(res[[i]][2], fd$highMZ[i],
                                           tolerance = mzTolerance))
            lweq <- isTRUE(base::all.equal(res[[i]][1], fd$lowMZ[i],
                                           tolerance = mzTolerance))
            if (!(hgeq & lweq))
                stop("M/Z values of spectrum ", i, " don't match ",
                     "lowMZ and highMZ.")
        }
    }
    if (emptyMz > 0)
        warning("Header value lowMZ and highMZ is 0 for ",
                emptyMz, " spectra.")

    ## Check that msLevel of spectra matches msLevel of featureData.
    if (any(fData(object)$msLevel != sapply(spectra(object), msLevel)))
        stop("msLevel in featureData does not match msLevel from the spectra!")
    return(TRUE)
}

############################################################
## precursorValue_OnDiskMSnExp
##
## Returns requested information from the featureData, ensuring
## that data for MS1 is set to NA. Throws an error if the
## object contains only MS1 data.
precursorValue_OnDiskMSnExp <- function(object, column) {
    ## Throw an error if we've got only MS1:
    if (all(unique(msLevel(object)) == 1))
        stop("This experiment contains MS1 spectra.")
    ps <- fData(object)[, column]
    names(ps) <- featureNames(object)
    ## Replacing values for MS 1 with NA.
    ps[msLevel(object) == 1] <- NA
    return(ps)
}

#' @title Apply a function to spectra loaded from a single file
#'
#' @description This function creates `Spectrum1` and `Spectrum2` objects for
#'     the specicied spectra in one file and applies the provided function to
#'     each of them.
#'
#' @details The function processes MS level 1 and > 1 separately, first reading
#'     all mz/intensity values for each spectrum and creating then the list
#'     of `Spectrum1` or `Spectrum2` objects using a constructor written in C
#'     that creates all spectra in one call (i.e. using a for loop in C).
#'     After that, depending on the arguments `queue` and `APPLYFUN` a `lapply`
#'     call is performed applying the respective calls to each spectrum. Finally
#'     results are ordered and returned.
#'
#' @note Using the C constructor that takes all values at once and creates a
#'     list of `Spectrum1` objects, applies processing steps, applies the
#'     provided function and returns its results - or the list of
#'     `Spectrum1` objects if `APPLYFUN = NULL`.
#'
#'     Note: enforces ordering of M/Z-intensity pairs by M/Z.
#'
#' @param fData: either a full `data.frame` (returned by `fData(OnDiskMSnExp))`
#'     or a sub-set forspecific spectra. The data.frame should ONLY
#'     CONTAIN VALUES FOR SPECTRA OF ONE FILE!
#'
#' @param filenames: `fileNames(object)` with `object` being an `OnDiskMSnExp`.
#'
#' @param queue: `object@spectraProcessingQueue`; if `lenght > 0` all
#'     processing steps will be applied to the created spectrum
#'     objects.
#'
#' @param APPLYFUN: the function to be applied to the spectrum objects
#'     (such as `ionCount` etc). If `NULL` the function returns the list of
#'     spectrum objects.
#'
#' @param fastLoad: `logical(1)` whether reading the spectras' header data
#'     should be omitted prior to retrieving the data (i.e. skip the
#'     `mzR::header` call before calling `mzR::peaks`. The former call might be
#'     required on some systems (so far macOS) for some files.
#'
#' @param ...: additional arguments for the APPLYFUN
#'
#' @return `list` with either spectrum objects or the results of the function
#'     provided with argument `APPLYFUN`.
#'
#' @author Johannes Rainer
#'
#' @noRd
#'
#' @md
.applyFun2SpectraOfFileMulti <- function(fData, filenames,
                                         queue = NULL,
                                         APPLYFUN = NULL,
                                         fastLoad = TRUE,
                                         ...) {
    suppressPackageStartupMessages(
        require(MSnbase, quietly = TRUE)
    )
    verbose. <- isMSnbaseVerbose()
    if (missing(fData) | missing(filenames))
        stop("Both 'fData' and 'filenames' are required!")
    filename <- filenames[fData[1, "fileIdx"]]
    ## issue #214: define backend based on file format.
    fileh <- .openMSfile(filename)
    msLevel1 <- which(fData$msLevel == 1)
    msLevelN <- which(fData$msLevel > 1)
    ## Reading the header for the selected spectra. This is to avoid getting
    ## "memory not mapped" errors when reading mz and intensity values from
    ## certain mzML files (issue #170). Since this problem seems to be absent
    ## on linux and Windows systems we allow the user to disable it.
    ## Also we are just reading the header for the last spectrum since that
    ## seems to fix it too.
    if (!fastLoad)
        hd_spectra <- mzR::header(fileh, max(fData$spIdx))
    ## Process MS1 and MSn separately
    if (length(msLevel1) >= 1) {
        ms1fd <- fData[msLevel1, , drop = FALSE]
        ## Reading all of the data in "one go". According to issue
        ## #103 we should use acquisitionNum, not spectrum idx.
        ## See issue #118 for an explanation of the match
        ## allSpect <- mzR::peaks(fileh,
        ##                        match(ms1fd$acquisitionNum, hd$acquisitionNum))
        ## spIdx is the index of the spectra in the provided header, thus we
        ## don't require to load the header and do the matching.
        allSpect <- mzR::peaks(fileh, ms1fd$spIdx)
        ## If we have more than one spectrum the peaks function returns a list.
        if (is(allSpect, "list")) {
            nValues <- base::lengths(allSpect, use.names = FALSE) / 2
            allSpect <- do.call(rbind, allSpect)
        } else {
            ## otherwise it's a matrix, e.g. if only a single scan
            ## index was provided.
            nValues <- nrow(allSpect)
        }
        ## Call the C-constructor to create a list of Spectrum1
        ## objects.
        ## Sorting of M/Z values as discussed in issue #135
        ## Benchmarks for this: issue #136
        res <- Spectra1_mz_sorted(peaksCount = nValues,
                                  rt = ms1fd$retentionTime,
                                  acquisitionNum = ms1fd$acquisitionNum,
                                  scanIndex = ms1fd$spIdx,
                                  tic = ms1fd$totIonCurrent,
                                  mz = allSpect[, 1],
                                  intensity = allSpect[, 2],
                                  fromFile = ms1fd$fileIdx,
                                  centroided = ms1fd$centroided,
                                  smoothed = ms1fd$smoothed,
                                  polarity = ms1fd$polarity,
                                  nvalues = nValues)
        names(res) <- rownames(ms1fd)
    } else {
        res <- list()
    }
    if (length(msLevelN) >= 1) {
        msnfd <- fData[msLevelN, , drop = FALSE]
        ## Reading all of the data in "one go".
        ## See issue #118 for an explanation of the match/no match.
        ## allSpect <- mzR::peaks(fileh,
        ##                        match(msnfd$acquisitionNum, hd$acquisitionNum))
        allSpect <- mzR::peaks(fileh, msnfd$spIdx)
        ## If we have more than one spectrum the peaks function returns a list.
        if (is(allSpect, "list")) {
            nValues <- as.integer(base::lengths(allSpect, use.names = FALSE) / 2)
            allSpect <- do.call(rbind, allSpect)
        } else {
            ## otherwise it's a matrix, e.g. if only a single scan
            ## index was provided.
            nValues <- nrow(allSpect)
        }
        ## Call the C-constructor to create a list of Spectrum2
        ## objects.
        ## Sorting of M/Z values as discussed in issue #135
        res2 <- Spectra2_mz_sorted(msLevel = msnfd$msLevel,
                                   peaksCount = nValues,
                                   rt = msnfd$retentionTime,
                                   acquisitionNum = msnfd$acquisitionNum,
                                   scanIndex = msnfd$spIdx,
                                   tic = msnfd$totIonCurrent,
                                   mz = allSpect[, 1],
                                   intensity = allSpect[, 2],
                                   fromFile = msnfd$fileIdx,
                                   centroided = msnfd$centroided,
                                   smoothed = msnfd$smoothed,
                                   polarity = msnfd$polarity,
                                   merged = msnfd$mergedScan,
                                   precScanNum = msnfd$precursorScanNum,
                                   precursorMz = msnfd$precursorMZ,
                                   precursorIntensity = msnfd$precursorIntensity,
                                   precursorCharge = msnfd$precursorCharge,
                                   collisionEnergy = msnfd$collisionEnergy,
                                   nvalues = nValues)
        names(res2) <- rownames(msnfd)
        res <- c(res, res2)
        rm(res2)
    }
    mzR::close(fileh)
    rm(fileh)
    ## Intermediate #151 fix. Performance-wise would be nice to get rid of this.
    ## gc()
    ## If we have a non-empty queue, we might want to execute that too.
    do_queue <- length(queue) > 0
    do_apply <- !is.null(APPLYFUN)
    if (do_apply | do_queue){
        if (do_queue) {
            if (verbose.) {
                message("Apply lazy processing step(s):")
                for (j in 1:length(queue))
                    message(" o '", queue[[j]]@FUN, "' with ",
                            length(queue[[j]]@ARGS), " argument(s).")
            }
        }
        res <- lapply(res, FUN = function(z, theQ, APPLF, ...){
            ## Apply the processing steps.
            if (do_queue) {
                for (pStep in theQ) {
                    z <- executeProcessingStep(pStep, z)
                }
            }
            if (do_apply) {
                return(do.call(APPLF, args = c(list(z), ...)))
            } else {
                return(z)
            }
        }, theQ = queue, APPLF = APPLYFUN, ...)
    }
    ## Ensure that ordering is the same than in fData:
    res[match(rownames(fData), names(res))]
}

#' @description Less memory demanding version of `applyFun2SpectraOfFileMulti`.
#'
#' @details This function loops through the rows of the `data.frame` passed
#'     with argument `fData`, loads the mz and intensity values of the
#'     respective spectrum from the original file, creates a `Spectrum` object,
#'     executes eventual *lazy evaluation* steps (argument `queue`) and
#'     executes, if provided, the `APPLYFUN` to the `Spectrum`.
#'
#' @note The function uses the *C* constructor for `Spectrum` objects that
#'     enforces also ordering of M/Z-intensity pairs by M/Z.
#'     Performance wise, this function is fast on gzipped mzML files but should
#'     not be used on CDF files!
#'
#' @param fData: either a full `data.frame` (returned by `fData(OnDiskMSnExp))`
#'     or a sub-set forspecific spectra. The data.frame should ONLY
#'     CONTAIN VALUES FOR SPECTRA OF ONE FILE!
#'
#' @param filenames: `fileNames(object)` with `object` being an `OnDiskMSnExp`.
#'
#' @param queue: `object@spectraProcessingQueue`; if `lenght > 0` all
#'     processing steps will be applied to the created spectrum
#'     objects.
#'
#' @param APPLYFUN: the function to be applied to the spectrum objects
#'     (such as `ionCount` etc). If `NULL` the function returns the list of
#'     spectrum objects.
#'
#' @param fastLoad: `logical(1)` whether reading the spectras' header data
#'     should be omitted prior to retrieving the data (i.e. skip the
#'     `mzR::header` call before calling `mzR::peaks`. The former call might be
#'     required on some systems (so far macOS) for some files.
#'
#' @param ...: additional arguments for the APPLYFUN
#'
#' @return `list` with either spectrum objects or the results of the function
#'     provided with argument `APPLYFUN`.
#'
#' @author Johannes Rainer
#'
#' @noRd
#' @md
.applyFun2IndividualSpectraOfFile <- function(fData, filenames,
                                                  queue = NULL,
                                                  APPLYFUN = NULL,
                                                  fastLoad = TRUE,
                                                  ...) {
    suppressPackageStartupMessages(
        require(MSnbase, quietly = TRUE)
    )
    verbose. <- isMSnbaseVerbose()
    if (missing(fData) | missing(filenames))
        stop("Both 'fData' and 'filenames' are required!")
    if (length(queue) > 0) {
        if (verbose.) {
            message("Apply lazy processing step(s):")
            for (j in 1:length(queue))
                message(" o '", queue[[j]]@FUN, "' with ",
                        length(queue[[j]]@ARGS), " argument(s).")
        }
    }
    filename <- filenames[fData$fileIdx[1]]
    ## issue #214: define backend based on file format.
    fileh <- .openMSfile(filename)
    ## Reading the header for the selecte spectra. This is to avoid getting
    ## "memory not mapped" errors when reading mz and intensity values from
    ## certain mzML files (issue #170). Since this problem seems to be absent
    ## on linux and Windows systems we allow the user to disable it.
    ## Also we are just reading the header for the last spectrum since that
    ## seems to fix it too.
    if (!fastLoad)
        hd_spectra <- mzR::header(fileh, max(fData$spIdx))
    n_rows <- nrow(fData)
    do_queue <- length(queue) > 0
    do_apply <- !is.null(APPLYFUN)
    res <- vector("list", n_rows)
    for (i in 1:n_rows) {
        pks <- mzR::peaks(fileh, fData$spIdx[i])
        if (fData$msLevel[i] == 1) {
            sp <- Spectrum1_mz_sorted(rt = fData$retentionTime[i],
                                      acquisitionNum = fData$acquisitionNum[i],
                                      scanIndex = fData$spIdx[i],
                                      tic = fData$totIonCurrent[i],
                                      mz = pks[, 1],
                                      intensity = pks[, 2],
                                      fromFile = fData$fileIdx[i],
                                      centroided = fData$centroided[i],
                                      smoothed = fData$smoothed[i],
                                      polarity = fData$polarity[i])
        } else {
            sp <- Spectrum2_mz_sorted(msLevel = fData$msLevel[i],
                                      rt = fData$retentionTime[i],
                                      acquisitionNum = fData$acquisitionNum[i],
                                      scanIndex = fData$spIdx[i],
                                      tic = fData$totIonCurrent[i],
                                      mz = pks[, 1],
                                      intensity = pks[, 2],
                                      fromFile = fData$fileIdx[i],
                                      centroided = fData$centroided[i],
                                      smoothed = fData$smoothed[i],
                                      polarity = fData$polarity[i],
                                      merged = fData$mergedScan[i],
                                      precScanNum = fData$precursorScanNum[i],
                                      precursorMz = fData$precursorMZ[i],
                                      precursorIntensity = fData$precursorIntensity[i],
                                      precursorCharge = fData$precursorCharge[i],
                                      collisionEnergy = fData$collisionEnergy[i])
        }
        ## And now go through the processing queue - if not empty...
        if (do_queue) {
            for (pStep in queue) {
                sp <- executeProcessingStep(pStep, sp)
            }
        }
        ## Apply the function, if provided
        if (do_apply)
            res[[i]] <- do.call(APPLYFUN, args = c(list(sp), ...))
        else
            res[[i]] <- sp
    }
    names(res) <- rownames(fData)
    mzR::close(fileh)
    rm(fileh)
    ## Intermediate #151 fix. Performance-wise would be nice to get rid of this.
    ## gc()
    res
}


## Extract chromatogram(s).
#' @description This function extracts chromatograms efficiently for multiple
#'     rt and mz ranges by loading the data per file only once and performing
#'     the mz subsetting on the already loaded Spectrum1 classes.
#'
#' @note x has to be an MSnExp or OnDiskMSnExp object. The function is optimized
#'     for OnDiskMSnExp objects such that extraction is performed in parallel
#'     for each file.
#'
#' @param rt \code{matrix} with two columns and number of rows corresponding to
#'     the number of ranges to extract. If the number of columns of the matrix
#'     is not equal to 2, \code{range} is called on each row.
#'
#' @param mz \code{matrix} with two columns and number of rows corresponding to
#'     the number of ranges to extract. nrow of rt and mz have to match. If the
#'     number of columns of the matrix is not equal to 2, \code{range} is
#'     called on each row.
#'
#' @param x OnDiskMSnExp object from which to extract the chromatograms.
#'
#' @param missingValue value to be used as intensity if no signal was measured
#'     for a given rt.
#'
#' @param msLevel \code{integer(1)} ensuring that the chromatogram is extracted
#'     only for a specified MS level.
#'
#' @return A \code{matrix} with the \code{Chromatogram} objects with rows
#'     corresponding to ranges and columns to files/samples. \code{result[, 1]}
#'     will thus return a \code{list} of \code{Chromatogram} objects for the
#'     first sample/file, while \code{result[1, ]} returns a \code{list} of
#'     \code{Chromatogram} objects for the same rt/mz range for all files.
#'
#' @author Johannes Rainer
#'
#' @noRd
.extractMultipleChromatograms <- function(x, rt, mz, aggregationFun = "sum",
                                          BPPARAM = bpparam(),
                                          missingValue = NA_real_,
                                          msLevel = 1L) {
    missingValue <- as.numeric(missingValue)
    if (!any(.SUPPORTED_AGG_FUN_CHROM == aggregationFun))
        stop("'aggregationFun' should be one of ",
             paste0("'", .SUPPORTED_AGG_FUN_CHROM, "'", collapse = ", "))
    ## Ensure we're working on one MS level only!
    fns <- fileNames(x)
    x <- filterMsLevel(x, msLevel)
    if (length(x) == 0) {
        warning("No MS ", msLevel, " data present.")
        return(matrix(ncol = length(fns), nrow = 0))
    }
    if (missing(rt))
        rt <- matrix(c(-Inf, Inf), nrow = 1)
    if (missing(mz))
        mz <- matrix(c(-Inf, Inf), nrow = 1)
    ## Calculate the range for each row in rt
    if (ncol(rt) != 2)
        rt <- t(apply(rt, MARGIN = 1, range))
    ## Replicate if nrow rt is 1 to match nrow of mz.
    if (nrow(rt) == 1)
        rt <- matrix(rep(rt, nrow(mz)), ncol = 2, byrow = TRUE)
    if (ncol(mz) != 2)
        mz <- t(apply(mz, MARGIN = 1, range))
    if (nrow(mz) == 1)
        mz <- matrix(rep(mz, nrow(rt)), ncol = 2, byrow = TRUE)
    if (nrow(rt) != nrow(mz))
        stop("dimensions of 'rt' and 'mz' have to match")

    ## Identify indices of all spectra that are within the rt ranges.
    rtimes <- rtime(x)
    keep_idx <- unlist(apply(rt, MARGIN = 1, function(z)
        which(rtimes >= z[1] & rtimes <= z[2])), use.names = FALSE)
    keep_idx <- sort(unique(as.integer(keep_idx)))
    if (length(keep_idx) == 0)
        return(matrix(ncol = length(fns), nrow = 0))
    ## 1) Subset x keeping all spectra that fall into any of the provided rt
    ##    ranges.
    subs <- x[keep_idx]

    ## 2) Call the final subsetting on each file separately.
    subs_by_file <- splitByFile(subs, f = factor(seq_along(fileNames(subs))))
    suppressWarnings(
        res <- bpmapply(
            subs_by_file,
            match(fileNames(subs), fns),
            FUN = function(cur_sample, cur_file, rtm, mzm, aggFun) {
                ## Load all spectra for that file. applies also any proc steps
                sps <- spectra(cur_sample)
                ## Related to issue #229: can we avoid getting all spectra and
                ## just return the intensity values for each spectrum instead?
                rts <- rtime(cur_sample)
                cur_res <- vector("list", nrow(rtm))
                ## Loop through rt and mz.
                for (i in 1:nrow(rtm)) {
                    ## - Select all spectra within that range and call a
                    ##   function on them that does first filterMz and then
                    ##   aggregate the values per spectrum.
                    in_rt <- rts >= rtm[i, 1] & rts <= rtm[i, 2]
                    ## Return an empty Chromatogram if there is no spectrum/scan
                    ## within the retention time range.
                    if (!any(in_rt)) {
                        cur_res[[i]] <- MSnbase::Chromatogram(
                            filterMz = mzm[i, ],
                            fromFile = as.integer(cur_file),
                            aggregationFun = aggFun)
                        next
                    }
                    cur_sps <- lapply(
                        sps[in_rt],
                        function(spct, filter_mz, aggFun) {
                            spct <- filterMz(spct, filter_mz)
                            ## Now aggregate the values.
                            if (!spct@peaksCount)
                                return(c(NA_real_, NA_real_, missingValue,
                                         NA_real_))
                            c(range(spct@mz, na.rm = TRUE, finite = TRUE),
                              do.call(aggFun, list(spct@intensity,
                                                   na.rm = TRUE)),
                              spct@msLevel)
                        }, filter_mz = mzm[i, ], aggFun = aggFun)
                    ## Now build the Chromatogram class.
                    allVals <- unlist(cur_sps, use.names = FALSE)
                    ## Index for intensity values.
                    int_idx <- seq(3, length(allVals), by = 4)
                    ## Index for MS level
                    mslevel_idx <- seq(4, length(allVals), by = 4)
                    ints <- allVals[int_idx]
                    names(ints) <- names(cur_sps)
                    ## If no measurement is non-NA, still report the NAs and
                    ## use the filter mz as mz. We hence return a
                    ## Chromatogram with retention times of all spectra
                    ## within the mz/rt slice, but with all intensity
                    ## values being NA
                    mz_range <- mzm[i, ]
                    if (!all(is.na(ints)))
                        mz_range <- range(allVals[-c(int_idx, mslevel_idx)],
                                          na.rm = TRUE, finite = TRUE)
                    cur_res[[i]] <- MSnbase::Chromatogram(
                        rtime = rts[in_rt],
                        intensity = ints,
                        mz = mz_range,
                        filterMz = mzm[i, ],
                        fromFile = as.integer(cur_file),
                        aggregationFun = aggFun,
                        msLevel = as.integer(unique(allVals[mslevel_idx]))
                        )
                }
                cur_res
            }, MoreArgs = list(rtm = rt, mzm = mz, aggFun = aggregationFun),
            BPPARAM = BPPARAM, SIMPLIFY = FALSE)
    )
    ## Ensure that the lists have the same length than there are samples!
    fromF <- base::match(fileNames(subs), fns)

    ## Jumping here if we have a file with no spectra within the rt range.
    ## If we've got some files in which we don't have any signal in any range,
    ## fill it with empty Chromatograms. This ensures that the result has
    ## ALWAYS the same length/number of columns than there are samples.
    if (length(res) != length(fns)) {
        res_all_files <- vector(mode = "list", length = length(fns))
        res_all_files[fromF] <- res
        empties <- which(lengths(res_all_files) == 0)
        ## fill these
        for (i in empties) {
            empty_list <- vector(mode = "list", length = nrow(rt))
            for(j in 1:nrow(rt)) {
                empty_list[[j]] <- MSnbase::Chromatogram(filterMz = mz[j, ],
                                                fromFile = as.integer(i),
                                                aggregationFun = aggregationFun)
            }
            res_all_files[[i]] <- empty_list
        }
        res <- res_all_files
    }
    do.call(cbind, res)
}

##' The function extracts the mode (profile or centroided) from the
##' raw mass spectrometry file by parsing the mzML file directly. If
##' the object `x` stems from any other type of file, `NA`s are
##' returned.
##'
##' This function is much faster than [isCentroided()], which
##' estimates mode from the data, but is limited to data stemming from
##' mzML files which are still available in their original location
##' (and accessed with `fileNames(x)`).
##'
##' @title Get mode from mzML data file
##' @param x An object of class [OnDiskMSnExp-class].
##' @return A named `logical` vector of the same length as `x`.
##' @md
##' @author Laurent Gatto
##' @examples
##' library("msdata")
##' f <- proteomics(full.names = TRUE,
##'                 pattern = "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML.gz")
##' x <- readMSData(f, mode = "onDisk")
##' table(isCentroidedFromFile(x), msLevel(x))
isCentroidedFromFile <- function(x) {
    stopifnot(inherits(x, "OnDiskMSnExp"))
    fs <- fileNames(x)
    if (!all(fex <- file.exists(fs)))
        stop(sum(!fex), " files not found.")
    ## iterate over all files
    res <- vector("list", length = length(fs))
    for (i in seq_along(fs)) {
        f <- fs[i]
        if (.fileExt(f) != "mzML")
            .res <- rep(NA, length(x))
        else .res <- .isCentroidedFromFile(f)
        names(.res) <- formatFileSpectrumNames(i, seq(length(.res)))
        ## keep only features in x
        k <- match(featureNames(filterFile(x, i)), names(.res))
        res[[i]] <- .res[k]
    }
    res <- unlist(res)
    ## reorder
    res[featureNames(x)]
}

#' @description small helper function to efficiently split an OnDiskMSnExp by
#'     file avoiding costly validation calls.
#'
#' @author Johannes Rainer
#'
#' @noRd
.on_disk_split_by_file <- function(x) {
    a <- new("OnDiskMSnExp")
    expd <- new("MIAPE")
    procd <- x@processingData
    create_object <- function(i, x) {
        slot(expd, "instrumentManufacturer", check = FALSE) <-
            x@experimentData@instrumentManufacturer[i]
        slot(expd, "instrumentModel", check = FALSE) <-
            x@experimentData@instrumentModel[i]
        slot(expd, "ionSource", check = FALSE) <- x@experimentData@ionSource[i]
        slot(expd, "analyser", check = FALSE) <- x@experimentData@analyser[i]
        slot(expd, "detectorType", check = FALSE) <-
            x@experimentData@detectorType[i]
        slot(procd, "files", check = FALSE) <- x@processingData@files[i]
        slot(a, "processingData", check = FALSE) <- procd
        slot(a, "featureData", check = FALSE) <- extractROWS(
            x@featureData, which(x@featureData$fileIdx == i))
        if (!nrow(a@featureData))
            stop("No spectra present.", call. = FALSE)
        a@featureData$fileIdx <- 1L
        slot(a, "experimentData", check = FALSE) <- expd
        slot(a, "spectraProcessingQueue", check = FALSE) <-
            x@spectraProcessingQueue
        slot(a, "phenoData", check = FALSE) <- x@phenoData[i, , drop = FALSE]
        a
    }
    lapply(seq_along(fileNames(x)), create_object, x = x)
}

Try the MSnbase package in your browser

Any scripts or data that you put into this service are public.

MSnbase documentation built on Jan. 23, 2021, 2 a.m.