R/combineFeatures.R

Defines functions robustSummary aggvar combineMatrixFeatures combineFeaturesV combineFeaturesL .combineFeatures

Documented in aggvar

setMethod("combineFeatures", "MSnSet",
          function(object,
                   groupBy,
                   method = c("mean",
                              "median",
                              "weighted.mean",
                              "sum",
                              "medpolish",
                              "robust",
                              "iPQF",
                              "NTR"),
                   fcol,
                   redundancy.handler = c("unique", "multiple"),
                   cv = TRUE,
                   cv.norm = "sum",
                   verbose = isMSnbaseVerbose(),
                   fun,
                   ...)
              .combineFeatures(object, groupBy, method, fcol,
                               redundancy.handler, cv, cv.norm, verbose,
                               fun, ...))

.combineFeatures <- function(object, groupBy,
                   method = c("mean",
                              "median",
                              "weighted.mean",
                              "sum",
                              "medpolish",
                              "robust",
                              "iPQF",
                              "NTR"),
                   fcol,
                   redundancy.handler = c("unique", "multiple"),
                   cv, cv.norm, verbose, fun, ...) {
    if (!missing(fun)) {
        .Deprecated(
            msg = "Parameter 'fun' is deprecated. Please use 'method' instead")
        if (missing(method))
            method <- fun
    }
    if (is.character(method))
        method <- match.arg(method)
    if (missing(groupBy)) {
        if (missing(fcol))
            stop("Require either 'groupBy' or 'fcol'.")
        stopifnot(fcol %in% fvarLabels(object))
        groupBy <- fData(object)[, fcol]
    }
    if (anyNA(object)) {
        msg <- "Your data contains missing values. Please read the relevant section in the combineFeatures manual page for details on the effects of missing values on data aggregation."
        message(paste(strwrap(msg), collapse = "\n"))
    }

    if (is.list(groupBy)) {
        if (length(groupBy) != nrow(object))
            stop("'length(groupBy)' must be equal to 'nrow(object)': ",
                 length(groupBy), " != ", nrow(object), ".")
        if (!is.null(names(groupBy))) {
            if (!all(names(groupBy) %in% featureNames(object)))
                stop("'groupBy' names and 'featureNames(object)' don't match.")
            if (!all(names(groupBy) == featureNames(object))) {
                warning("Re-ordering groupBy to match feature names.")
                groupBy <- groupBy[featureNames(object)]
            }
        }
        redundancy.handler <- match.arg(redundancy.handler)
        result <- combineFeaturesL(object, groupBy, method,
                                   redundancy.handler,
                                   cv, cv.norm, verbose, ...)
    } else { ## factor, numeric or character
        result <- combineFeaturesV(object, groupBy, method,
                                   cv, cv.norm, verbose, ...)
    }
    if (validObject(result))
        return(result)
}

combineFeaturesL <- function(object,   ## MSnSet
                             groupBy,  ## list
                             method,
                             redundancy.handler,
                             cv = TRUE,
                             cv.norm = "sum",
                             verbose = isMSnbaseVerbose(),
                             ...    ## additional arguments to method
                             ) {
    ## handling of the redundancy
    if (redundancy.handler == "multiple") {
        expansion.index <- rep(seq_len(nrow(object)), sapply(groupBy, length))
        new.exprs <- exprs(object)[expansion.index, , drop = FALSE]
        rownames(new.exprs) <- NULL
        new.feature.data <- fData(object)[expansion.index, , drop = FALSE]
        rownames(new.feature.data) <- NULL
        ## TODO: check this
        object <- new("MSnSet", exprs = new.exprs,
                      featureData = new(
                          "AnnotatedDataFrame",
                          data = new.feature.data),
                      phenoData = phenoData(object))
        groupBy <- unlist(groupBy)
    } else { ## redundancy.handler == "unique" - checked by match.arg
        idx.unique <- sapply(groupBy, length) < 2
        object <- object[idx.unique, ]
        groupBy <- unlist(groupBy[idx.unique])
    }
    combineFeaturesV(object, groupBy, method, cv, cv.norm, verbose, ...)
}


combineFeaturesV <- function(object,   ## MSnSet
                             groupBy,  ## factor, character or numeric
                             method,
                             cv = TRUE,
                             cv.norm = "sum",
                             verbose = isMSnbaseVerbose(),
                             ...    ## additional arguments to method
                             ) {
    groupBy <- as.character(groupBy)
    if (cv) {
        ## New cv feature variable names
        cvfvars <- paste("CV", sampleNames(object), sep = ".")
        ## If not already present, use as is (i.e no suffix needed)
        if (!any(cvfvars %in% fvarLabels(object))) {
            .suffix <- NULL
        } else {
            ## Add a numeric suffix that isn't already in use
            .suffix <- 0
            while (any(cvfvars %in% fvarLabels(object))) {
                .suffix <- .suffix + 1
                cvfvars <- paste(cvfvars, .suffix, sep = ".")
            }
        }
        cv.mat <- featureCV(object, groupBy = groupBy,
                            norm = cv.norm,
                            suffix = .suffix)
    }
    n1 <- nrow(object)
    ## !! order of features in matRes is defined by the groupBy factor !!
    if (is.character(method) && method == "iPQF") {
        ## NB: here, we pass the object, not only assay data,
        ##     because iPGF also needs the feature data, otherwise
        ##     not passed and used in combineFeatureMatrix
        ##     iPQF still returns a matrix, though.
        matRes <- iPQF(object, groupBy, ...)
    } else if (is.character(method) && method == "NTR") {
        matRes <- normToReference(exprs(object), group=groupBy, ...)
        ## order matrix according to groupBy (is important because rownames are
        ## overwritten a few lines below
        matRes <- matRes[order(unique(groupBy)), , drop=FALSE]
    } else {
        matRes <- as.matrix(combineMatrixFeatures(exprs(object),
                                                  groupBy, method,
                                                  verbose = verbose,
                                                  ...))
    }
    fdata <- fData(object)[!duplicated(groupBy), , drop = FALSE] ## takes the first occurences
    fdata <- fdata[order(unique(groupBy)), , drop = FALSE] ## ordering fdata according to groupBy factor
    rownames(matRes) <- rownames(fdata)
    colnames(matRes) <- sampleNames(object)
    if (cv)
        fdata <- cbind(fdata, cv.mat)
    res <- new("MSnSet", exprs = matRes,
               featureData = new("AnnotatedDataFrame",
                                 data = fdata))
    res@processingData@merged <- TRUE
    res@qual <- object@qual[0, ]
    pData(res) <- pData(object)
    if (is.character(method)) {
        msg <- paste("Combined ", n1, " features into ",
                     nrow(res), " using ", method, sep = "")
    } else {
        msg <- paste("Combined ", n1, " features into ",
                     nrow(res), " using user-defined function",
                     sep = "")
    }
    if (verbose)
        message(msg)
    res@processingData@processing <- c(object@processingData@processing,
                                       paste(msg, ": ", date(), sep = ""))
    ## update feature names according to the groupBy argument
    ## new in version 1.5.9
    fn <- sort(unique(groupBy))
    featureNames(res) <- fn
    if (validObject(res))
        return(res)
}

combineMatrixFeatures <- function(matr,    ## matrix
                                  groupBy, ## char/factor
                                  method,
                                  verbose = isMSnbaseVerbose(),
                                  ...) { ## additional arguments to method
    if (is.character(method)) {
        ## Using a predefined function
        if (method == "medpolish") {
            summarisedFeatures <- by(matr,
                                     groupBy,
                                     function(x) {
                                         medpol <- medpolish(x, trace.iter = verbose, ...)
                                         return(medpol$overall + medpol$col)
                                     })
        } else if (method == "robust") {
            summarisedFeatures <- by(matr, groupBy, robustSummary, ...)
        } else if (method == "weighted.mean") {
            ## Expecting 'w' argument
            args <- list(...)
            if (is.null(args$w))
                stop("Expecting a weight parameter 'w' for 'weigthed mean'.")
            w <- args$w
            if (is.null(colnames(matr)))
                colnames(matr) <- paste0("X", 1:ncol(matr))
            summarisedFeatures <- apply(matr,2,
                                        function(x) {
                                            .data <- data.frame(x=x, groupBy, w=w)
                                            ddply(.data,
                                                  "groupBy",
                                                  summarise,
                                                  wmn = weighted.mean(x,w))
                                        })
            summarisedFeatures <- do.call(cbind, as.list(summarisedFeatures))
            rn <- summarisedFeatures[,1]
            summarisedFeatures <-
                summarisedFeatures[, grep("wmn", colnames(summarisedFeatures))]
            colnames(summarisedFeatures) <- colnames(matr)
            rownames(summarisedFeatures) <- rn
            return(summarisedFeatures)
        } else {
            ## using either 'sum', 'mean', 'median'
            summarisedFeatures <- by(matr,
                                     groupBy,
                                     function(x) apply(x, 2, eval(parse(text = method)), ...))
        }
    } else {
        ## using user-defined function
        summarisedFeatures <- by(matr,
                                 groupBy,
                                 function(x) apply(x, 2, method, ...))
    }
    return(do.call(rbind, as.list(summarisedFeatures)))
}

##' This function evaluates the variability within all protein group
##' of an \code{MSnSet}. If a protein group is composed only of a
##' single feature, \code{NA} is returned.
##'
##' This function can be used to identify protein groups with
##' incoherent feature (petides or PSMs) expression patterns. Using
##' \code{max} as a function, one can identify protein groups with
##' single extreme outliers, such as, for example, a mis-identified
##' peptide that was erroneously assigned to that protein group. Using
##' \code{mean} identifies more systematic inconsistencies where, for
##' example, the subsets of peptide (or PSM) feautres correspond to
##' proteins with different expression patterns.
##'
##' @title Identify aggregation outliers
##' @param object An object of class \code{MSnSet}.
##' @param groupBy A \code{character} containing the protein grouping
##'     feature variable name.
##' @param fun A function the summarise the distance between features
##'     within protein groups, typically \code{max} or
##'     \code{mean}.\code{median}.
##' @return A \code{matrix} providing the number of features per
##'     protein group (\code{nb_feats} column) and the aggregation
##'     summarising distance (\code{agg_dist} column).
##' @author Laurent Gatto
##' @seealso \code{\link{combineFeatures}} to combine PSMs
##'     quantitation into peptides and/or into proteins.
##' @examples
##' library("pRolocdata")
##' data(hyperLOPIT2015ms3r1psm)
##' groupBy <- "Protein.Group.Accessions"
##' res1 <- aggvar(hyperLOPIT2015ms3r1psm, groupBy, fun = max)
##' res2 <- aggvar(hyperLOPIT2015ms3r1psm, groupBy, fun = mean)
##' par(mfrow = c(1, 3))
##' plot(res1, log = "y", main = "Single outliers (max)")
##' plot(res2, log = "y", main = "Overall inconsistency (mean)")
##' plot(res1[, "agg_dist"], res2[, "agg_dist"],
##'      xlab = "max", ylab = "mean")
aggvar <- function(object, groupBy, fun) {
    stopifnot(inherits(object, "MSnSet"))
    stopifnot(groupBy %in% fvarLabels(object))
    .d <- function(x, fun. = fun) {
        dx <- dist(as.matrix(x))
        if (length(dx) == 0) return(NA)
        do.call(fun., list(dx))
    }
    gb <- fData(object)[, groupBy]
    nr <- by(exprs(object), gb, nrow)
    d <-  by(exprs(object), gb, .d, fun)
    ans <- cbind(agg_dist = d, nb_feats = nr)
    attr(ans, "agg_dist") <- fun
    ans
}

##' This function calculates the robust summarisation for each feature
##' (protein). Note that the function assumes that the intensities in
##' input `e` are already log-transformed.
##'
##' @title Calculate robust expression summary
##' @param e A feature (peptide or spectra) by sample `matrix`
##'     containing the expression data.
##' @param ... Additional parameters passed to `MASS::rlm`
##' @return `numeric()` vector of length `length(expression)` with
##'     robust summarised values.
##' @author Adriaan Sticker, Sebastian Gibb and Laurent Gatto
##' @md
##' @noRd
robustSummary <- function(e, residuals = FALSE, ...) {
    ## If there is only one 1 peptide for all samples return
    ## expression of that peptide
    if (nrow(e) == 1L) return(e)

    ## remove missing values
    p <- !is.na(e)
    expression <- e[p] ## expression becomes a vector
    sample <- rep(colnames(e), each = nrow(e))[p]
    feature <- rep(rownames(e), times = ncol(e))[p]

    ## model.matrix breaks on factors with 1 level so make vector of
    ## ones (intercept).
    if (length(unique(sample)) == 1L) sample <- rep(1, length(sample))

    ## Sum contrast on peptide level so sample effect will be mean
    ## over all peptides instead of reference level.
    X <- stats::model.matrix(~ -1 + sample + feature,
                             contrasts.arg = list(feature = 'contr.sum'))
    ## MASS::rlm breaks on singulare values.
    ## - Check with base lm if singular values are present.
    ## - If so, these coefficients will be zero, remove this collumn
    ##   from model matrix
    ## - Rinse and repeat on reduced modelmatrx till no singular
    ##   values are present
    repeat {
        fit <- stats::.lm.fit(X, expression)
        id <- fit$coefficients != 0
        X <- X[ , id, drop = FALSE]
        if (!any(!id)) break
    }
    ## Last step is always rlm: calculate estimated effects effects as
    ## summarised values
    fit <- MASS::rlm(X, expression, ...)

    sampleid <- seq_along(unique(sample))
    ## This will be needed for NUSE-type of quality control, but will
    ## need to check for missing data as below.
    ## se <- unique(summary(fit)$coefficients[sampleid, 'Std. Error'])

    ## Take the sample coefficients ( = summarised expression values)
    coef  <-  fit$coefficients[sampleid]
    ## Sort the sample coefficients in the same way as the samplenames
    ## of expression matrix. Puts NA for the samples without any
    ## expression value
    coef[paste0('sample', colnames(e))]
}

Try the MSnbase package in your browser

Any scripts or data that you put into this service are public.

MSnbase documentation built on Jan. 23, 2021, 2 a.m.