R/Functions_extract_EICs.R

Defines functions fastPeakShapes subsetEICs .peakArea bestgauss peakFitter peakFunction rawEICm

Documented in bestgauss fastPeakShapes peakFitter peakFunction rawEICm subsetEICs

#' multiEIC
#' 
#' Extract EICs from multiple files for multiple features.
#' 
#' @param rawdata a (named) list of xcmsRaw objects, e.g. generated by 
#' Metaboseek::loadRawM, OR an MSnbase OnDiskMSnExp object 
#' @param mz a data.frame with minimum (column 1) and maximum (column 2) 
#' m/z values in each row.
#' @param rt a data.frame with minimum (column 1) and maximum (column 2) 
#' retention time values (in seconds) in each row.
#' @param rnames names of the rows in the result matrix, typically the 
#' row.names of the feature table, or the filenames if byFile =T.
#' @param byFile a data.frame with columns File and Group , holding file 
#' paths and group names, respectively.
#' @param XIC deprecated
#' @param getgauss additionally, fit a gauss curve to each EIC (time 
#' consuming), defaults to FALSE
#' @param RTcorr if not NULL, this RTcorr object will be used to 
#' adjust retention times.
#' @param workers number of workers for parallel processing
#' @param quickshapes get quickshapes score instead of geegauss results
#' @param SN when calculating peak quality with quickshapes, 
#' reject any peaks with a signal-to-noise ratio than this (intensity range 
#' within observed rt window, MAX/ MEAN INTENSITY!)
#' @param scoreBy specify which column is used to find which EIC to use 
#' for quickshapes
#' 
#' @importFrom BiocParallel SerialParam
#' 
#' @return a list of extracted ion chromatograms that can be read 
#' by \code{\link{EICplot}}, \code{\link{groupPlot}}, \code{\link{EICgeneral}}
#' 
#' @export
multiEIC <- function (rawdata,
                      mz,
                      rt,
                      rnames = row.names(mz),
                      byFile = F,#if true, table will be sorted by rawfile, otherwise by feature
                      XIC = F,
                      getgauss = F,
                      RTcorr = NULL,
                      workers = 1,
                      quickshapes = F,
                      SN = NULL,
                      scoreBy = "intmean"
){
    
    if(is.null(rt)){
        if(class(rawdata)  != "OnDiskMSnExp"){
            sts = lapply(rawdata,slot,"scantime")
            rt <- data.frame(rtmin = rep(min(unname(sapply(sts,min))),nrow(mz)),
                             rtmax = rep(max(unname(sapply(sts,max))),nrow(mz)))
        }
        else{
            rt <- data.frame(rtmin = rep(min(rawdata@featureData@data$retentionTime),nrow(mz)),
                             rtmax = rep(max(rawdata@featureData@data$retentionTime),nrow(mz)))
        }
        
    }
    
    mx <- as.matrix(cbind(mz,rt))
    
    if(nrow(mx) ==1){
        mxl <-unname(as.list(data.frame((mx[,1:2]))))
        rxl <-unname(as.list(data.frame((mx[,3:4]))))}
    else{
        mxl <-unname(as.list(data.frame(t(mx[,1:2]))))
        rxl <-unname(as.list(data.frame(t(mx[,3:4]))))
    }    
    
    if(class(rawdata)  != "OnDiskMSnExp"){
        fx3 <- function(ls, mz, rt, rfile, gauss = getgauss, RTcorrx = NULL){
            
            if(!is.null(RTcorrx)){
                ls$rt <- unname(RTcorrx$corr[[which(basename(RTcorrx$fnames) == basename(rfile@filepath@.Data))]])[ls$scan]
            }else{
                ls$rt <- rfile@scantime[ls$scan]
            }
            
            ls$tic <- rfile@tic[ls$scan]
            ls$mzmin <- mz[1]
            ls$mzmax <- mz[2]
            ls$rtmin <- rt[1]
            ls$rtmax <- rt[2]
            ls$intmax <- max(ls$intensity)
            ls$intmin <- min(ls$intensity)
            ls$intsum <- sum(ls$intensity)
            ls$intmean <- mean(ls$intensity)
            # ls$intmedian <- median(ls$intensity)
            #ls$int25perc <- quantile(ls$intensity, 0.25)
            #ls$intsd <- sd(ls$intensity)/ls$intmean
            if(ls$intmax == 0){ls$maxrun <- 0}
            else{
                runs <- rle(ls$intensity > ls$intmean)
                ls$maxrun <- max(runs$lengths[runs$values])
            }
            
            if(gauss){
                #if(ls$intmedian > 0 && (!is.null(SN) && ls$intmax/(ls$intmedian) > SN)){return(0)}
                middlescans <- as.integer(quantile(seq_along(ls$rt),0.25)):as.integer(quantile(seq_along(ls$rt),0.75)) 
                smallWindowEstimate <- peakFitter(ls$rt[middlescans], ls$intensity[middlescans], median(ls$rt), 0.2, startdepth = 1, maxdepth = 5)
                #allowWorseScore so that if the initial fit with the larger rt window is worse, more fits are still tried:
                return(peakFitter(ls$rt, ls$intensity, median(ls$rt), 0.4, startdepth = 1, maxdepth = 5, best_estimate = smallWindowEstimate, allowWorseScore = T)$cor$estimate)
            }
            return(ls)
        }
        summe <- list()
        if(byFile){
            
            #this works because mxl  and other objects are in the upstream seach path for the function
            summe <- BiocParallel::bplapply(names(rawdata), function(i){
                rawfile <- rawdata[[i]]
                res <- mapply(rawEICm, mzrange = mxl,
                              rtrange = rxl, MoreArgs=list(object=rawfile), SIMPLIFY = F)
                
                res <- t(mapply(fx3, res, mz = mxl,rt=rxl, MoreArgs=list(rfile=rawfile, gauss = getgauss, RTcorrx = RTcorr)))
                
            }, BPPARAM = SnowParam(workers = if(nrow(mz) > 500){workers}else{1}))
            
            return(summe)
            
        }else{
            summe <- lapply(c(1:nrow(mz)), function(i){
                #for(i in c(1:nrow(mz))){
                if(!is.null(rnames)){
                    featname <- rnames[i]}
                else{featname <-i}
                
                res <- mapply(rawEICm, object=rawdata, 
                              MoreArgs=list(mzrange = mxl[[i]], rtrange = rxl[[i]]), SIMPLIFY = F)
                
                res <- t(mapply(fx3, res, rfile=rawdata, MoreArgs=list(mz = mxl[[i]],
                                                                       rt = rxl[[i]],
                                                                       gauss = getgauss,
                                                                       RTcorrx = RTcorr
                )
                ))
                
                if(quickshapes){
                    
                    #find the best signal to noise ratio:
                    if(!is.null(SN)){
                        snrs <- unlist(res[,"intmax"])/unlist(res[,"intmin"])
                        #snrs2 <- unlist(res[,"intmax"])/unlist(res[,"int25perc"])
                        presel <- which(!is.na(snrs) 
                                        #& !is.na(snrs2) 
                                        &  snrs > SN 
                                        #& snrs2 > SN/2
                        )
                    }else{
                        presel <- which(unlist(res[,"intmax"]) > 0)
                    }
                    
                    if(length(presel) < 1){return(0)}
                    
                    # allwidths <- sapply(seq(length( res[presel,"intensity"])), function(n,its,mI){
                    #   
                    #   if(max(its[[n]]) == 0){return(0)}
                    #   
                    #   
                    #   runs <- rle(its[[n]] > mI[[n]])
                    #   return(max(runs$lengths[runs$values]))
                    #   
                    # }, res[presel,"intensity"], res[presel,"intmean"])
                    # 
                    # if(!any(allwidths >= minWidth)){return(0)}
                    
                    # if(selByWidth){
                    #   
                    #   #now select the peak to be analyzed from those that meet the presel condition:
                    #   pretopint <- which.max(allwidths)
                    #   
                    #   topint <- presel[pretopint]}
                    # else{
                    
                    
                    topint <- presel[which.max(res[presel,scoreBy])]
                    
                    # }
                    # }
                    
                    #unnecessary
                    #if(res[[topint,"intmean"]] == 0){return(0)}
                    
                    
                    middlescans <- as.integer(quantile(seq_along(res[[topint,"rt"]]),0.25)):as.integer(quantile(seq_along(res[[topint,"rt"]]),0.75)) 
                    maxmiddle <- res[[topint,"rt"]][middlescans][which.max(res[[topint,"intensity"]][middlescans])]
                    smallWindowEstimate <- peakFitter(res[[topint,"rt"]][middlescans], 
                                                                   res[[topint,"intensity"]][middlescans],
                                                                   #median(res[[topint,"rt"]]),
                                                                   maxmiddle,
                                                                   0.2, startdepth = 1, maxdepth = 5)
                    #allowWorseScore so that if the initial fit with the larger rt window is worse, more fits are still tried:
                    
                    
                    ps <- peakFitter(res[[topint,"rt"]], 
                                                  res[[topint,"intensity"]],
                                                  # median(res[[topint,"rt"]]),
                                                  maxmiddle,
                                                  0.4, startdepth = 1, maxdepth = 5, best_estimate = smallWindowEstimate, allowWorseScore = T)$cor$estimate
                    return(ps)
                    
                }else{
                    return(res)
                }
                
            })
            
            return(summe)
        }
    }else{
        fx4 <- function(onDisk, onDiskChrom, mz, rt, rfile, gauss = getgauss, RTcorrx = NULL){
            
            filenum <- which(onDisk@phenoData@data$sampleNames == basename(rfile))
            selchrom <- onDiskChrom[[which(unlist(lapply(onDiskChrom, slot, "fromFile")) == filenum
                                           & sapply(lapply(onDiskChrom, slot, "filterMz"),min) == min(mz)
                                           & sapply(lapply(onDiskChrom, slot, "filterMz"),max) == max(mz))]]
            scansel <- which(onDisk@featureData@data$fileIdx == filenum
                             & onDisk@featureData@data$retentionTime >= min(rt)
                             & onDisk@featureData@data$retentionTime <= max(rt))
            
            
            ls <- list()
            ls$scan <- onDisk@featureData@data$spIdx[scansel]
            
            ls$intensity <- selchrom@intensity
            ls$rt <- selchrom@rtime
            
            ls$tic <-onDisk@featureData@data$totIonCurrent[scansel]
            
            
            ls$mzmin <- mz[1]
            ls$mzmax <- mz[2]
            ls$rtmin <- rt[1]
            ls$rtmax <- rt[2]
            ls$intmax <- max(ls$intensity)
            ls$intmin <- min(ls$intensity)
            ls$intsum <- sum(ls$intensity)
            ls$intmean <- mean(ls$intensity)
            if(gauss){
                middlescans <- as.integer(quantile(seq_along(ls$rt),0.25)):as.integer(quantile(seq_along(ls$rt),0.75)) 
                smallWindowEstimate <- peakFitter(ls$rt[middlescans], ls$intensity[middlescans], median(ls$rt), 0.2, startdepth = 1, maxdepth = 5)
                #allowWorseScore so that if the initial fit with the larger rt window is worse, more fits are still tried:
                return(peakFitter(ls$rt, ls$intensity, median(ls$rt), 0.4, startdepth = 1, maxdepth = 5, best_estimate = smallWindowEstimate, allowWorseScore = T)$cor$estimate)}
            return(ls)
        }
        
        onDiskChrom <- MSnbase::chromatogram(rawdata,
                                             mz = mxl, 
                                             rt = rxl,
                                             BPPARAM = SerialParam(), #note that the default (BiocParallel::bpparam()) is much slower in the EIC visualization use case
                                             missing = 0,
                                             aggregationFun = "sum")
        
        summe <- list()
        #key step for throughput
        if(byFile){
            
            for(i in as.character(rawdata@phenoData@data$sampleNames)){
                rawfile <- i
                
                summe[[i]] <- t(mapply(fx4, mz = mxl,rt=rxl,
                                       MoreArgs=list(onDisk= rawdata,
                                                     onDiskChrom = onDiskChrom,
                                                     rfile=i, gauss = getgauss,
                                                     RTcorrx = NULL)))
            }
        }else{
            for(i in c(1:nrow(mz))){
                if(!is.null(rnames)){
                    featname <- rnames[i]}
                else{featname <-i}
                summe[[featname]] <- t(mapply(fx4, rfile=as.character(rawdata@phenoData@data$sampleNames), 
                                              MoreArgs=list(onDisk= rawdata,
                                                            onDiskChrom = onDiskChrom,
                                                            mz = mz,
                                                            rt = rt,
                                                            gauss = F,
                                                            RTcorrx = NULL
                                              )))
            }}
        return(summe)
    }
}

#' multiEICplus
#'
#' get EICs for adducts, isotopes and neutral loss masses
#'
#' @inheritParams multiEIC
#' @param adducts numeric() of mass shifts
#' @param ... all other arguments passed on to \code{\link{multiEIC}()}
#' 
#' @return a list of extracted ion chromatograms that can be read 
#' by \code{\link{EICplot}}, \code{\link{groupPlot}}, \code{\link{EICgeneral}},
#' extended with EICs of adduct species
#' 
#' @export
multiEICplus <- function (adducts = c(0,1,2,3),
                          mz,
                          rt,
                          ...
                          #          EICsets = list(mz,
                          #                        rt,
                          #    rnames,
                          #   byFile = F,
                          #  rawdata) #if true, table will be sorted by rawfile, otherwise by feature
                          #
){
    
    liEIC <- list()
    if(is.null(adducts)){adducts <- 0}
    
    for (r in 1:length(adducts)){
        liEIC[[r]] <- mz+adducts[r]
        
    }
    
    res <- sapply(liEIC,multiEIC,rt, ...)
    #  rawdata= EICsets$rawdata,
    
    # rt = EICsets$rt,
    # rnames = EICsets$rnames,
    #byFile = F,#if true, table will be sorted by rawfile, otherwise by feature
    #XIC = F,
    #getgauss = F
    #)
    return(res)
}

#' rawEICm
#' 
#' EICs from an xcmsRaw object, modified version of xcms::rawEIC xcmsRaw method.
#' Major difference: Handling of cases where no scan is in rt range.
#' xcms::rawEIC also drops the last scan within range, rawEICm does not.
#' 
#' @param object xcmsRaw object
#' @param mzrange a range of m/z values (numeric(2)).
#' @param rtrange a range of rt values (numeric(2)).
#' @param scanrange a range of scan number values (numeric(2)).
#' @param viewermode True to change handling of out of range rt values for the Mseek viewer. DEPRECATED (always used)
#' 
#' @return a list with elements \code{scan} and \code{intensity}, representing intensity for an m/z range across scans
#' 
#' @export
rawEICm <- function(object,
                    mzrange = numeric(),
                    rtrange =numeric(),
                    scanrange = numeric(),
                    viewermode = T)  {
    #print(paste(mzrange, rtrange))
    
    if (length(rtrange) >= 2 ) {
        
        if(max(rtrange) <= 0){return(list(scan = 1, intensity = numeric(1)))} #quick fix for extreme cases of rt correction (rtmin and rtmax both negative and then set to 0)
        if(max(rtrange) < min(object@scantime)){return(list(scan = 1, intensity = numeric(1)))} #also fixing behaviour if no scans inside the selected rt range
        
        
        rtrange <- range(rtrange)
        
        #if sccanrange is off, just return EIC for entire range (Viewer only shows the relevant section which then is still empty)
        if(max(object@scantime) < rtrange[2] ){rtrange[2] <- max(object@scantime)}
        if(max(object@scantime) < rtrange[1] ){rtrange[1] <- min(object@scantime)}
        
        
        
        scanidx <- (object@scantime >= rtrange[1]) & (object@scantime <= rtrange[2])
        
        scanrange <- c(match(TRUE, scanidx), length(scanidx) - match(TRUE, rev(scanidx)) + 1 ) # +1 is a fix to include last scan that meets condition, fixes problem if only one scan meets condition
        
        #this is to handle exceptional situations where through retention time correction, the rtmin and rtmax both are between
        if(!any(scanidx) 
           & rtrange[2] <= max(object@scantime) 
           & rtrange[1] >= min(object@scantime)){
            scanrange <- range(c(which.min(abs(object@scantime - rtrange[1])),
                                 which.min(abs(object@scantime - rtrange[2]))))}
        
    }  else if (length(scanrange) < 2){
        scanrange <- c(1, length(object@scantime))}
    else{
        scanrange <- range(scanrange)}
    
    scanrange[1] <- max(1,scanrange[1])
    scanrange[2] <- min(length(object@scantime),scanrange[2]) #this should actually avoid the problem..
    
    if (!is.double(object@env$mz))  object@env$mz <- as.double(object@env$mz)
    if (!is.double(object@env$intensity)) object@env$intensity <- as.double(object@env$intensity)
    if (!is.integer(object@scanindex)) object@scanindex <- as.integer(object@scanindex)
    
    
    .Call("getEIC",object@env$mz,object@env$intensity,object@scanindex,as.double(mzrange),as.integer(scanrange),as.integer(length(object@scantime)), PACKAGE ='xcms' )
    
    
}


#' peakFunction
#' 
#' Function to represent a peak in peakFitter
#' 
#' @param x numeric vector
#' @param theta a numeric(4), see \code{details}
#' 
#' @details:
#' \itemize{
#' \item \code{theta[1]}: numeric value of the expected/initial position of the peak apex (in range of x)
#' \item \code{theta[2]}: numeric value of the expected/initial peak width. Should not be 0!
#' \item \code{theta[3]}: y-scale factor
#' \item \code{theta[4]}: shift along y-axis
#' }
#' 
#' @return a series of values approximately representing a peak shape along x
#'  
peakFunction <- function(x, theta)  { 
    m <- theta[1]; s <- theta[2]; a <- theta[3]; b <- theta[4];
    a*exp(-0.5*((x-m)/s)^2) + b
}


#' peakFitter
#' 
#' fit a curve into a numeric vector. Will recursively try different starting values for the optimizer.
#' 
#' @param x numeric vector to use as inout for fit function (typically series of scan numbers or retention time values)
#' @param y numeric vector to fit the curve, same length as x
#' @param m numeric value of the expected/initial position of the peak apex (in range of x)
#' @param s numeric value of the expected/initial peak width. Should not be 0!
#' @param startdepth defaults to 1, part of limiting the recursion
#' @param maxdepth limits recursion to trying to fit curve \code{maxdepth} times with increasing \code{s} values 
#' @param best_estimate Best estimate from previous iterations - argument carries previous results through recursion
#' @param allowWorseScore if FALSE, recursion ends when an iteration has a worse result than the previous one,
#'  and will return the best result up to that point
#' 
#' @return a list, see \code{details}
#' 
#' @details 
#' Elements in returned list:
#' \itemize{
#' \item \code{depth} integer indicationg recursion depth
#' \item \code{fit} result of using \code{\link[stats]{nls}()} on \code{x} with
#'  \code{\link{peakFunction}()}, followed by \code{\link[stats]{fitted}()}
#' \item \code{cor} result of calling \code{\link[stats]{cor.test}()} on \code{y}
#'  and \code{fit}
#' 
#' }
#' 
#' 
#' @importFrom stats fitted nls cor.test
#' 
#' @export
peakFitter <- function(x, y, m, s, startdepth = 1, maxdepth = 5, best_estimate = 0, allowWorseScore = F){
    
    if(!is.list(best_estimate)){
        best_estimate <- list(depth = startdepth,
                              fit = integer(length(x)),
                              cor = list(estimate = best_estimate,
                                         p.value = 1))
    }
    
    if(startdepth < maxdepth && best_estimate$cor$estimate < 0.999){
        tryCatch({
            fit <- nls(y ~ peakFunction(x,c(m,s,a,b)), data.frame(x,y), start=list(m=m, s=s, a= max(y), b=0))
            fittedFit <- fitted(fit)
            cor <- cor.test(y,fittedFit,method="pearson",use="complete")
            
            
            
            if(cor$estimate > best_estimate$cor$estimate){
                best_estimate <- list(depth = startdepth,
                                      fit = fittedFit,
                                      cor = cor)
                return(peakFitter(x,y,m,s*3, startdepth+1, maxdepth, best_estimate))
            }else{
                
                if(allowWorseScore){
                    return(peakFitter(x,y,m,s*3, startdepth+1, maxdepth, best_estimate, allowWorseScore = T))
                }
                
                return(best_estimate)
            }
            
            
        },
        error = function(e){
            return(peakFitter(x,y,m,s*3, startdepth+1, maxdepth, best_estimate))
        },
        silent = T)
        
    }else{
        return(
            best_estimate
        )
    }
}




#' getgauss
#' 
#' NOTE: function is deprecated, use \code{\link{peakFitter}} where possible
#' 
#' fit a gauss curve into a curve (numeric vector). Note that results will be skewed 
#' if scanrate is low or heterogeneous (e.g. ddMS2 experiments).
#' 
#' @param y numeric() to fit the curve
#' @param pval if p.values of fit is larger than this, will return 0
#' 
#' @return a numeric fit estimate as returned by \code{\link[stats]{cor.test}()}
#' 
#' @importFrom stats fitted nls cor.test
#' 
#' @export
getgauss <- function (y, pval = 1){
    
    
    #substract "baseline"
    y <- y - min(y)
    x <- seq_along(y)
    
    #normalize intensities to 1
    
    y <- if(max(y)>0){y/max(y)}else{y}
    
    #here starts the gaussian test, cf. http://www.metabolomics-forum.com/index.php?topic=1031.0 (Krista Longnecker/Tony Larson)
    #fit gauss and let failures to fit through as corr=1
    ## new approach without xcms functions adapted from here: https://stats.stackexchange.com/questions/70153/linear-regression-best-polynomial-or-better-approach-to-use/70184#70184
    
    
    f <- function(x, theta)  { 
        m <- theta[1]; s <- theta[2]; a <- theta[3]; b <- theta[4];
        a*exp(-0.5*((x-m)/s)^2) + b
    }
    #
    # Estimate some starting values.
    # Do the fit.  (It takes no time at all.)
    fit <- tryCatch({
        nls(y ~ f(x,c(m,s,a,b)), data.frame(x,y), start=list(m=max(x)/2, s=max(x)/10, a= max(y), b=0))
    },
    error = function(e){
        try(nls(y ~ f(x,c(m,s,a,b)), data.frame(x,y), start=list(m=max(x)/2, s=max(x)/4, a= max(y), b=0)), silent = T)
    }, silent = T)
    
    gauss <-  if(class(fit) == "try-error")
    {
        0
    } else
    {
        #calculate correlation of summe$intensity against gaussian fit
        if(length(which(!is.na(y-fitted(fit)))) > 2 &&
           length(!is.na(unique(y)))>2 && length(!is.na(unique(fitted(fit))))>2)
        {
            cor <- NULL
            cor <- try(cor.test(y,fitted(fit),method="pearson",use="complete"), silent = T)
            if(class(fit) != "try-error")
            {
                if(cor$p.value <= pval) cor$estimate else 0
            } else 0
        } else 0
        
    }
    return(gauss)}


#' bestgauss
#' 
#' wrapper function for using \code{multiEIC} with \code{getgauss = TRUE}
#' 
#' @return best gauss curve fit score for any of the files in rawdata for
#'  each feature returned by multiEIC.
#' 
#' @param ... arguments passed on to multiEIC().
#' 
#' 
#' @export
bestgauss <- function(...){
    res <- multiEIC(..., byFile = T, getgauss = T)
    
    return(
        data.frame(Peak_Quality = suppressWarnings({
            
            apply(matrix(unlist(res),ncol = length(res)),1,max, na.rm = T)  
            
        })
        ) 
    )
}  

#' exIntensities
#' 
#' Lightweight variant of multiEIC to extract average in a file, more suitable for large featuretables.
#' 
#' @param rawfile an xcmsRaw object
#' @param mz numeric (): m/z values (same length as nrow(rtw)).
#' @param ppm m/z width (+/- ppm from values defined in mz)
#' @param rtw a data.frame with minimum (column 1) and maximum (column 2) retention time values (in seconds) in each row.
#' @param baselineSubtract subtract baseline when calculating intensities
#' @param SN signal to noise ratio. If not NULL, all peaks with max/min peak intensity below this will be reported as intensity 0. Requires Baselinesubstraction to be off.
#' @param areaMode if TRUE, will calculate peak areas rather than mean intensities
#' 
#' @return numeric vector with intensity values for all features defined by 
#' \code{mz} and \code{rt} in \code{rawfile}
#' 
#' @importFrom Biobase rowMax rowMin
#' @describeIn exIntensities extract intensities base function
#' @export
exIntensities <- function (rawfile,
                           mz,
                           ppm = 5,
                           rtw= data.frame(0,10),
                           baselineSubtract = T,
                           SN = NULL,
                           areaMode = F
                           
){
    
    ##template for RTcorrection implementation later
    # if(!is.null(RTcorrx)){
    #                            ls$rt <- unname(RTcorrx$corr[[which(basename(RTcorrx$fnames) == basename(rfile@filepath@.Data))]])[ls$scan]
    #                          }else{
    #                            ls$rt <- rfile@scantime[ls$scan]
    #                          }
    
    mx <- matrix(data= c(mz-ppm*(mz/1000000),
                         mz+ppm*(mz/1000000),
                         rowMin(as.matrix(rtw)),
                         rowMax(as.matrix(rtw))), nrow= length(mz), ncol=4)
    
    mxl <-unname(as.list(data.frame(t(mx[,1:2]))))
    rxl <-unname(as.list(data.frame(t(mx[,3:4]))))
    
    
    summe <- mapply(rawEICm, mzrange = mxl,
                    rtrange = rxl, MoreArgs=list(object=rawfile, scanrange = numeric(), viewermode = F),
                    SIMPLIFY = F)
    
    #substract "baseline" and get rid of scan#
    fx <- function(x){
        
        if(baselineSubtract){
            intens <- x$intensity-min(x$intensity)
        }
        else{intens <- x$intensity}
        
        if(!is.null(SN)){
            snf <- max(x$intensity)/min(x$intensity)
            if(is.na(snf) | snf < SN){return(0)}
        }
        
        if(!areaMode){return(mean(intens))}
        
        return(.peakArea(rawfile@scantime[x$scan],intens))
        
    }
    
    res <- sapply(summe, fx)
    message(1)
    return(res)}

#' .peakArea
#' 
#' Calculate area under the curve
#' 
#' @param x,y two numeric vectors of equal length
#' @return the area under the curve, a numeric(1)
#' @noRd
.peakArea <- function(x,y){
    dx <- c(diff(x), 0)
    dy <- c(diff(y), 0)
    return(sum(y * dx) + sum(dy * dx)/2)
    }

#' subsetEICs
#'
#' helper function to subset output from multiEICplus
#'
#' @param EIClist matrix or list of EIC objects
#' @param group file grouping information (named list)
#' 
#' @return a list of extracted ion chromatograms that can be read 
#' by \code{\link{EICplot}}, \code{\link{groupPlot}}, \code{\link{EICgeneral}}
#'  
#' @export
subsetEICs <- function(EIClist,
                       group){
    
    maxEIC <- numeric(1)
    maxTIC <- numeric(1)
    ##subset lines
    for(i in 1:length(EIClist)){
        EIClist[[i]] <- matrix(EIClist[[i]][group,],
                               nrow = length(group),
                               ncol = ncol(EIClist[[i]]),
                               dimnames = list(rows = group,
                                               columns = colnames(EIClist[[i]])))
    }
    
    for(n in 1:length(EIClist)){
        #   if(length(group)==1){
        #    maxEIC <- max(maxEIC,unlist(EIClist[[n]][,"intensity"][[1]]))
        #   maxTIC <- max(maxTIC,unlist(EIClist[[n]][,"tic"][[1]]))
        
        #  }else{
        maxEIC <- max(maxEIC,unlist(EIClist[[n]][,"intensity"]))
        maxTIC <- max(maxTIC,unlist(EIClist[[n]][,"tic"]))
        # }
    }
    
    out <- list(EIClist,maxEIC,maxTIC)
    names(out) <- c("EIClist","maxEIC","maxTIC")
    
    return (out)
}

#' fastPeakShapes
#' 
#' Calculate peak quality for features in a feature table across a set of MS data files.
#' Will only try fitting a curve in the sample with highest intensity for each molecular feature.
#' 
#' @param rawdata an xcmsRaw object
#' @param mz numeric (): m/z values (same length as nrow(rtw)).
#' @param ppm m/z width (+/- ppm from values defined in mz)
#' @param rtw a data.frame with minimum (column 1) and maximum (column 2) retention time values (in seconds) in each row.
#' @param workers number of wprkers to use in parallel processing
#' 
#' @return numeric vector with peak quality estimates for each feature 
#' defined by \code{mz} and \code{rt}
#' 
#' @export
fastPeakShapes <- function(rawdata, mz, ppm, rtw, workers = 1){
    
    if(isRunning()){
        try({
            setProgress(value = 0, message = 'Preparing Peak Shape Analysis...')
        })
    }
    
    withCallingHandlers({
        ints <- as.data.frame(lapply(bplapply(rawdata, 
                                              exIntensities, 
                                              mz, ppm, rtw, 
                                              baselineSubtract = F, 
                                              SN = 10,
                                              BPPARAM = bpparam()# SnowParam(workers = if(length(mz) > 10000){workers}else{1})
        ),
        unlist))},
        message = function(m){ if(isRunning()){incProgress(amount = sum(as.numeric(unlist(strsplit(as.character(m$message), split = "\n"))))/length(rawdata))} })
    
    maxind <- apply(ints,1,which.max)
   
    
     if(isRunning()){
       try({
            setProgress(value = 0, message = 'Running Peak Shape Analysis...')
       })
     }
    
    withCallingHandlers({
    scorelist <- bplapply(seq(length(rawdata)), function(n){
        
        
        selfeats <- which(maxind == n)
        
        if(length(selfeats) > 0){
            
          res <- unlist(multiEIC(rawdata= rawdata[n],
                                        mz = data.frame(mzmin = mz[selfeats]-ppm*1e-6*mz[selfeats], mzmax=mz[selfeats]+ppm*1e-6*mz[selfeats]),
                                        rt = rtw[selfeats,],
                                        rnames = NULL,
                                        byFile = F,#if true, table will be sorted by rawfile, otherwise by feature
                                        XIC = F,
                                        getgauss = F,
                                        RTcorr = NULL, 
                                        workers =  1,
                                        quickshapes = T,
                                        SN = 10,
                                        scoreBy = "intmean"))}
        else{ res <- numeric(0)}
        
                message(1)

                return(res)
        
        },
        
        BPPARAM = bpparam()#SnowParam(workers = if(length(mz) > 5000){workers}else{1})
        )},
    message = function(m){ if(isRunning()){incProgress(amount = as.numeric(m$message)/length(rawdata))} })
    
    
    scores <- numeric(length(mz))
    
    for(n in seq(length(rawdata))){
        
        scores[maxind == n] <- scorelist[[n]] 
    }
    
    return(scores)
    
    
}
mjhelf/Metaboseek documentation built on April 23, 2022, 12:09 p.m.