R/purityD-dims-purity.R

Defines functions stderror covar dimsPredictPuritySingleMz get_mz_sim_scanid dimsPredictPuritySingle predictPurityExp

Documented in dimsPredictPuritySingle

#' @include purityD-constructor.R
NULL

# msPurity R package for processing MS/MS data - Copyright (C)
#
# This file is part of msPurity.
#
# msPurity is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msPurity is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msPurity.  If not, see <https://www.gnu.org/licenses/>.


#' @title Using purityD object, assess anticipated purity from a DI-MS run
#'
#' @description
#' Assess the precursor purity of anticpated MS/MS spectra.
#' i.e. it 'predicts' the precursor purity of the DI-MS peaks for a future MS/MS run.
#'
#' @aliases dimsPredictPurity
#'
#' @param Object object = purityD object
#' @param sampleOnly boolean = if TRUE will only calculate purity for sample peaklists
#' @inheritParams dimsPredictPuritySingle
#
#' @return  purityD object with predicted purity of peaks
#' @examples
#'
#' datapth <- system.file("extdata", "dims", "mzML", package="msPurityData")
#' inDF <- Getfiles(datapth, pattern=".mzML", check = FALSE, cStrt = FALSE)
#' ppDIMS <- purityD(fileList=inDF, cores=1, mzML=TRUE)
#' ppDIMS <- averageSpectra(ppDIMS)
#' ppDIMS <- filterp(ppDIMS)
#' ppDIMS <- subtract(ppDIMS)
#' ppDIMS <- dimsPredictPurity(ppDIMS)
#' @return purityD object
#' @seealso \code{\link{dimsPredictPuritySingle}}
#'
#'
#' @export
setMethod(f="dimsPredictPurity", signature="purityD",
          definition= function(Object, ppm = 1.5, minOffset=0.5, maxOffset=0.5,
                               iwNorm=FALSE, iwNormFun=NULL, ilim=0.05, sampleOnly=FALSE,
                               isotopes=TRUE, im=NULL) {
            requireNamespace('foreach')

            Object@purityParam$minOffset = minOffset
            Object@purityParam$maxOffset = minOffset
            Object@purityParam$ppm = ppm
            Object@purityParam$iwNorm = iwNorm
            Object@purityParam$iwNormFun = iwNormFun
            Object@purityParam$ilim = ilim
            Object@purityParam$isotopes =isotopes
            Object@purityParam$im = im

            # Check if multicore
            if (Object@cores>1){
              operator <- foreach::'%dopar%'
              cl<-parallel::makeCluster(Object@cores, type = "SOCK")
              doSNOW::registerDoSNOW(cl)

            }else{
              operator <- foreach::'%do%'
            }

            # Check if only sample peaks required
            if (sampleOnly){
              pidx <- Object@sampleIdx
            }else{
              pidx <- seq(1, nrow(Object@fileList))
            }
            purityPeaksAll <- operator(foreach::foreach(i=1:length(pidx), .packages = "mzR"),
                                       predictPurityExp(Object, pidx[[i]]))

            for (i in 1:length(pidx)){
              Object@avPeaks$processed[[pidx[i]]] <- purityPeaksAll[[i]]
            }

            return(Object)
          })



predictPurityExp <- function(Object, fidx){
  origPeaks <- Object@avPeaks$processed[[fidx]]

  if(nrow(origPeaks)==0){
    return(origPeaks)
  }

  filepth <- as.character(Object@fileList$filepth[fidx])

  # if iwNorm is TRUE and iwNormFun is NULL
  if(is.null(Object@purityParam$iwNormFun)){
    Object@purityParam$iwNormFun <- iwNormRcosine()
  }

  purity <- dimsPredictPuritySingle(mztargets = origPeaks$mz,
                                    filepth = filepth,
                                    minOffset = Object@purityParam$minOffset,
                                    maxOffset = Object@purityParam$maxOffset,
                                    ppm = Object@purityParam$ppm,
                                    mzML = Object@mzML,
                                    iwNorm=Object@purityParam$iwNorm,
                                    iwNormFun=Object@purityParam$iwNormFun,
                                    ilim=Object@purityParam$ilim,
                                    mzRback=Object@purityParam$mzRback,
                                    isotopes=Object@purityParam$isotopes,
                                    im=Object@purityParam$im)

  pPeaks <- cbind(origPeaks, purity)

  return(pPeaks)
}



#' @title Predict the precursor purity from a DI-MS dataset
#'
#' @description
#' Given a an DI-MS dataset (either mzML or .csv file) calculate the predicted
#' purity for a vector of mz values.
#'
#' Calculated at a given offset e.g. for 0.5 +/- Da the minOffset would be 0.5
#' and the maxOffset of 0.5.
#'
#' A ppm tolerance is used to find the target mz value in each scan.
#'
#' @param mztargets vector = mz targets to get predicted purity for
#' @param filepth character = mzML file path or .csv file path
#' @param minOffset numeric = isolation window minimum offset
#' @param maxOffset numeric = isolation window maximum offset
#' @param ppm numeric = tolerance for target mz value in each scan
#' @param mzML boolean = Whether an mzML file is to be used or .csv file (TRUE == mzML)
#' @param iwNorm boolean = if TRUE then the intensity of the isolation window will be normalised based on the iwNormFun function
#' @param iwNormFun function = A function to normalise the isolation window intensity. The default function is very generalised and just accounts for edge effects
#' @param ilim numeric = All peaks less than this percentage of the target peak will be removed from the purity calculation, default is 5% (0.05)
#' @param mzRback character = backend to use for mzR parsing
#' @param isotopes boolean = TRUE if isotopes are to be removed
#' @param im matrix = Isotope matrix, default removes C13 isotopes (single, double and triple bonds)
#' @param sim boolean = TRUE if file is from sim stitch experiment. Default FALSE
#' @examples
#' mzmlPth <- system.file("extdata", "dims", "mzML", "B02_Daph_TEST_pos.mzML",
#'                        package="msPurityData")
#' predicted <- dimsPredictPuritySingle(c(173.0806, 216.1045), filepth=mzmlPth,
#'                              minOffset=0.5, maxOffset=0.5, ppm=5, mzML=TRUE)
#' @return a dataframe of the target mz values and the predicted purity score
#' @export
dimsPredictPuritySingle <- function(mztargets,
                                    filepth,
                                    minOffset=0.5,
                                    maxOffset=0.5,
                                    ppm=2.5,
                                    mzML=TRUE,
                                    iwNorm=FALSE,
                                    iwNormFun=NULL,
                                    ilim=0.05,
                                    mzRback='pwiz',
                                    isotopes=TRUE,
                                    im=NULL,
                                    sim=FALSE){

  # open the file and get the scans
  if(mzML==TRUE){
    # mzML files opened with mzR
    loadNamespace('mzR')
    mr <- mzR::openMSfile(filepth, backend=mzRback)
    scanPeaks <- mzR::peaks(mr)
    h <- mzR::header(mr)

    # only want the ms1 scans
    hms1 <- h[h$msLevel==1,]
    scans <- hms1$seqNum
    rm(h)
    # get peaks from each scan
    scanPeaks <- mzR::peaks(mr)

    if (sim){
      # if file contains sim-stitch we only want to look at sim scans
      meta_info <- get_additional_mzml_meta(filepth)
      scans <- as.numeric(meta_info[meta_info$scanid %in% scans & meta_info$sim==TRUE,]$scanid)
    }else{
      meta_info <- NA
    }

    # only get scans that are required for analysis
    scanPeaks <- scanPeaks[scans]

  }else{
    # MSFileReader outputs opened with as .csv files
    MSfile <- read.csv(filepth)
    scanPeaks <- plyr::dlply(MSfile, ~ scanid, function(x){x[-1]})
  }

  # if iwNorm is TRUE and iwNormFun is NULL then a gaussian model of the
  # isolation window will be used to normalise intensity
  if(is.null(iwNormFun)){
    # Using a gaussian curve 3 SD either side
    iwNormFun <- iwNormGauss(3, -minOffset, maxOffset)
  }

  # perform the purity prediction on each target mz value
  pureList <- lapply(mztargets, dimsPredictPuritySingleMz,
                     scanPeaks=scanPeaks,
                     minOffset=minOffset,
                     maxOffset=maxOffset,
                     ppm=ppm,
                     iwNorm=iwNorm,
                     iwNormFun=iwNormFun,
                     ilim=ilim,
                     isotopes=isotopes,
                     im=im,
                     meta_info=meta_info,
                     sim=sim,
                     scanids=scans)
  puredf <- do.call(rbind.data.frame, pureList)

  colnames(puredf) <- c('medianPurity','meanPurity',
                        'sdPurity', 'cvPurity', 'sdePurity', "medianPeakNum")
  return(puredf)

}

get_mz_sim_scanid <- function(meta_info, mz){


  filt <- meta_info[meta_info$sim ==TRUE &
                         mz > meta_info$scan_window_lower_limit &
                         mz < meta_info$scan_window_upper_limit, ]
  if(nrow(filt)<1){
    return(NA)
  }else{
    scnids <- as.numeric(filt$scanid)
  }

  return(scnids)


}

dimsPredictPuritySingleMz <- function(mz, scanPeaks, minOffset, maxOffset, ppm,
                                      plot=FALSE, plotdirpth, iwNorm=FALSE, iwNormFun=NULL,
                                      ilim=0.05, isotopes=TRUE, im=NULL, meta_info=NA, sim=FALSE, scanids=NA){

  if (is.na(mz)){
    return(rep(NA, 6))
  }

  # Get isolation window
  minmz <- mz-minOffset
  maxmz <- mz+maxOffset

  purityall <- ""
  pknmall <- ""

  if(sim){
    in_range_scanids <- get_mz_sim_scanid(meta_info, mz)
    if (anyNA(in_range_scanids)){
      print('CHECK')
      return(rep(NA, 6))
    }
    scanPeaks <- scanPeaks[scanids %in% in_range_scanids]
  }

  for (i in 1:length(scanPeaks)){


    x <- scanPeaks[[i]]

    pout <- pcalc(x, mzmin=minmz, mzmax=maxmz,
                  mztarget=mz, ppm=ppm, iwNorm=iwNorm,
                  iwNormFun=iwNormFun, ilim=ilim,
                  isotopes=isotopes, im=im)

    purityi <- pout[1]
    pknm <- pout[2]

    if(plot==TRUE){

      png(file.path(plotdirpth, paste("scan_", i, "_", mz,".png",sep="" )),
          width=10,height=10,units="in",res=1200)

      plot(sub, type="h", xlim=c(minmz, maxmz),  xlab="m/z", ylab="Intensity",
           main=paste("Isolation window surrounding m/z value", mz),
           cex.lab=2, cex.axis=2, cex.main=2, cex.sub=2, lwd=4)

      details <- paste("target I =", round(mtchi,0), "\ntotal I =", round(alli,0),
                       "\nPurity (target/total) =",round(purityi,3))
      points(mtch[1], mtch[2], type="h", col="red", lwd=5)
      text(mtch[1], mtch[2], paste(details),pos = 4, cex=2)
      dev.off()
    }

    purityall <- c(purityall, purityi)
    pknmall <- c(pknmall, pknm)

  }

  purityall <- as.numeric(purityall[-1])
  pknmmpall <- as.numeric(pknmall[-1])

  puritySum <- c(median(purityall, na.rm = TRUE), mean(purityall, na.rm=TRUE),
                 sd(purityall, na.rm=TRUE),  covar(purityall), stderror(purityall),
                 median(pknm, na.rm = TRUE))

  return(puritySum)

}

covar <- function(x){ ( 100*sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) )} # CV (otherwise known as RSD)
stderror <- function(x){
  x <- x[!is.na(x)]
  return(sd(x)/sqrt(length(x)))
}
computational-metabolomics/msPurity documentation built on May 13, 2024, 7:36 p.m.