importomics: Unified statistal Modeling of Omics Data

Documented in guess_compounddiscoverer_quantity .read_compounddiscoverer

#---------------------------------------------------------------------------
# 
#                       guess_compounddiscoverer_quantity
#
#---------------------------------------------------------------------------


#' compound discoverer quantity patterns
#' @examples
#' COMPOUNDDISCOVERER_PATTERNS
#' @export
COMPOUNDDISCOVERER_PATTERNS <- c(
    `area`           = '^Area: (.+)$',
    `normalizedarea` = '^Norm\\. Area: (.+)$')

#' Guess compound discoverer quantity from snames
#'
#' @param x character vector
#' @return  string: value from names(COMPOUNDDISCOVERER_PATTERNS)
#' @examples
#' \dontrun{
#' # file
#'     file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#'     guess_compounddiscoverer_quantity(file)
#' }
#'
#' # character vector
#'     x <- "Area: 20230908_F143_HILICNEG.raw (F11)"
#'     guess_compounddiscoverer_quantity(x)
#'
#'     x <- "Norm. Area: 20230908_F143_HILICNEG.raw (F11)"
#'     guess_compounddiscoverer_quantity(x)
#' @export
guess_compounddiscoverer_quantity <- function(x){
# Assert
    assert_is_character(x)
# read if x is filename
    if (is_scalar(x)){
        if (is_existing_file(x)){  
            x <- names(fread(x, header = TRUE, nrows = 1))
        }
    }
# guess from character vector
    for (quantity in names(COMPOUNDDISCOVERER_PATTERNS)){
        pattern <- COMPOUNDDISCOVERER_PATTERNS[[quantity]]
        if (any(stri_detect_regex(x, pattern)))   return(quantity)
    }
    stop('quantity could not be infered')
}


#---------------------------------------------------------------------------
# 
#                       .read_compounddiscoverer
#
#---------------------------------------------------------------------------

#' Read compound discoverer files as-is
#' @param file               compoumd discoverer file
#' @param quantity           string
#' @param mod_extract        function to extract MS modi from sample names
#' @param colname_format     function to reformat column names
#' @param verbose            TRUE / FALSE
#' @return data.table
#' @export
.read_compounddiscoverer <- function(
    file, quantity = guess_compounddiscoverer_quantity(file), 
    colname_format = NULL,
    mod_extract = NULL,
    verbose = TRUE){
    . <- Name <- Value <- rowid <- Acquisition <- NULL
# Assert
    assert_compounddiscoverer_output(file)
    assert_is_subset(quantity, names(COMPOUNDDISCOVERER_PATTERNS))
    if (!is.null(colname_format)) assert_is_function(colname_format)
    assert_is_function(mod_extract)
# Read
    if (verbose)  cmessage('%sRead%scompound discoverer output   %s', spaces(4), spaces(6), file)
    cddt <- fread(file, integer64 = 'numeric') %>% 
      un_int64()
    n0 <- nrow(cddt)
# Determine MS modus
    cddt %<>%
      dplyr::mutate(
        `MS Modus` = mod_extract(names(.)) %>%
          unique() %>%
          assert_are_same_length(1))
# Determine ID qualifier
    cddt %<>%
      dplyr::rowwise() %>%
      dplyr::mutate(
        `ID Quality` =  if (
          all(is.na(dplyr::c_across(dplyr::ends_with("Best Match")))))
          { NA } else {
            max(
              dplyr::c_across(dplyr::ends_with("Best Match")), na.rm = TRUE) },
        `ID Quality` =  dplyr::case_when(
          `ID Quality` > 80    ~ 2,
          `ID Quality` > 60    ~ 3,
          TRUE                 ~ 4)) %>%
      dplyr::ungroup()
# Extract require columns
    pattern <- COMPOUNDDISCOVERER_PATTERNS[[quantity]]
    anncols <- c(
      'Name', 'Compounds ID', 'MS Modus', 'Formula', 'Annot. DeltaMass [ppm]',
      'Calc. MW', 'm/z', 'RT [min]', 'ID Quality', 'mzCloud Best Match',
      'mzVault Best Match')
    anncols %<>% intersect(names(cddt))
    valcols <- grep(pattern, names(cddt), value = TRUE)
    cddt %<>% magrittr::extract(, c(anncols, valcols), with = FALSE)
# Format column names
    if (!is.null(colname_format))
      cddt %<>% magrittr::set_colnames(colname_format(names(.)))
# Uniquify Name; choose detection with largest area; DON'T use `Compounds ID`, CompoundDiscoverer produces multiple IDs per trivial name
    cddt %<>%
      dplyr::mutate(rowid = dplyr::row_number()) %>%
      dplyr::relocate(rowid) %>%
      tidyr::pivot_longer(
        cols      = dplyr::matches(COMPOUNDDISCOVERER_PATTERNS[quantity]),
        names_to  = "Acquisition",
        values_to = "Value") %>%
      dplyr::group_by(Name) %>%
      dplyr::arrange(dplyr::desc(Value), by_group = TRUE)
    if (verbose) {
       n <- cddt %>%
         dplyr::summarise(n = length(unique(rowid))) %>% 
         dplyr::filter(n > 1) %>%
         nrow()
       if (n > 0) cmessage('%sMerge %s multiple detects by max %s', spaces(18), n, quantity)
    } 
    cddt %<>%
      dplyr::filter(rowid == rowid[1]) %>%
        dplyr::select(-rowid) %>%
      tidyr::pivot_wider(
        names_from = c(Acquisition),
        values_from = Value) %>%
      dplyr::ungroup() %>%
      as.data.table(integer64 = "numeric")
# Return
    cddt[]
}

#---------------------------------------------------------------------------
#
#                 dequantify_compounddiscoverer
#
#---------------------------------------------------------------------------
#' dequantify_compounddiscoverer compound discoverer snames
#' 
#' Drop quantity.
#' 
#' `Norm. Area: 20230908_F143_HILICNEG.raw (F11)  ->  20230908_F143_HILICNEG.raw (F11)`
#'       `Area: 20230908_F143_HILICNEG.raw (F11)  ->  20230908_F143_HILICNEG.raw (F11)`
#' @param x        `character`
#' @param quantity `'area',              'normalizedarea'`
#' @param verbose  `TRUE` or `FALSE`
#' @return `character`
#' @examples
#' dequantify_compounddiscoverer("Norm. Area: 20230908_F143_HILICNEG.raw (F11)") # Norm. Area
#' dequantify_compounddiscoverer("Area: 20230908_F143_HILICNEG.raw (F11)")       # Area
#' @md
#' @export
dequantify_compounddiscoverer <- function(
    x, quantity = guess_compounddiscoverer_quantity(x), verbose  = FALSE
){
  pattern <- COMPOUNDDISCOVERER_PATTERNS[[quantity]]
  cleanx <- stri_replace_first_regex(x, pattern, '$1')
  if (verbose)  cmessage('%sStandardize snames: %s  ->  %s', spaces(14), x[1], cleanx[1])
  return(cleanx)
}

#' merge compound discoverer files
#' @param x        `list`
#' @param quantity `'area', 'normalizedarea'`
#' @param verbose  `TRUE` or `FALSE`
#' @return `data.table`
#' @export
merge_compounddiscoverer <- function(x, quantity = NULL, verbose = TRUE) {
    Modus <- Name <- Value <- Acquisition <- NULL
    assert_is_list(x)
    assert_has_names(x)
    assert_is_a_string(quantity)
    assert_is_subset(quantity, names(COMPOUNDDISCOVERER_PATTERNS))
    assert_all_have_setidentical_colnames(x)
# Introduce origin identifiers
    x <- lapply(
      names(x),
      function(y){
        x[[y]] %>%
          dplyr::mutate(
            Modus = y,
            ID    = paste(Modus, seq(dplyr::n()), sep = "_"))
      }) %>%
      magrittr::set_names(names(x))
# Ensure proper treatment of data from single source file
    if (length(x) == 1) return(utils::head(x, n = 1))
# Pivot longer and combine
    if (verbose)  cmessage('%sMerge%scompount discoverer output%s(choose origin with max %s)', spaces(4), spaces(5), spaces(3), quantity)
    x %<>%
      lapply(
        tidyr::pivot_longer,
        cols      = dplyr::matches(COMPOUNDDISCOVERER_PATTERNS[quantity]),
        names_to  = "Acquisition",
        values_to = "Value") %>%
      dplyr::bind_rows()
# Processing of merged data
    ## Group by `Name`
    x %<>% dplyr::group_by(Name)
    ## Add column indicating all sources
    x %<>% dplyr::mutate(Modi   = paste(unique(Modus), collapse = ";"))
    ## Identify source providing max. *Area and filter by it
    x %<>%
      dplyr::arrange(dplyr::desc(Value), .by_group = TRUE) %>%
      dplyr::filter(Modus == Modus[1])
# Pivot data wide again
    x %>% 
      tidyr::pivot_wider(
        names_from = c(Acquisition),
        values_from = Value) %>%
      dplyr::ungroup() %>%
      as.data.table(integer64 = "numeric")
}

#---------------------------------------------------------------------------
#
#                   read_compounddiscoverer
#
#---------------------------------------------------------------------------

#' Read compound discoverer output
#' @param dir                      compound discoverer output directory
#' @param files                    compound discoverer output files
#' @param colname_regex            regular expression to parse sample names from column names
#' @param mod_extract              function to extract MS modi from sample names
#' @param colname_format           function to reformat column names
#' @param quantity                 'area', 'normalizedarea' or NULL
#' @param nonames                  TRUE or FALSE: retain compunds without Names?
#' @param exclude_sname_pattern    regular expression of sample names to exclude
#' @param subgroups                NULL or string vector : subgroups to retain
#' @param logbase                  base for logarithmization of the data
#' @param impute                   TRUE or FALSE: impute group-specific NA values?
#' @param plot                     TRUE or FALSE: plot ?
#' @param label                    fvar
#' @param pca                      TRUE or FALSE: run pca ?
#' @param pls                      TRUE or FALSE: run pls ?
#' @param fit                      model engine: 'limma', 'lm', 'lme(r)', 'wilcoxon' or NULL
#' @param formula                  model formula
#' @param block                    model blockvar: string or NULL
#' @param coefs                    model coefficients    of interest: character vector or NULL
#' @param contrasts                coefficient contrasts of interest: character vector or NULL
#' @param palette                  color palette : named character vector
#' @param verbose                  TRUE or FALSE : message ?
#' @return SummarizedExperiment
#' @export
read_compounddiscoverer <- function(
    dir = getwd(),
    files                 = list.files(
                              path = dir, pattern = "(RP|HILIC).*\\.csv$",
                              full.names          = TRUE),
    colname_regex         = "^(.*)\\d{8,8}_+(.*)_+((HILIC|RP)(NEG|POS))\\.raw.*$",
    colname_format        = function(x)
                              stringi::stri_replace_first_regex(
                                x, colname_regex, "$1$2"),
    mod_extract           = function(x)
                              stringi::stri_subset_regex(
                                x, colname_regex) %>%
                                stringi::stri_replace_first_regex(
                                  colname_regex, "$3"),
    quantity              = NULL,
    nonames               = FALSE,
    exclude_sname_pattern = "(blank|QC|RS)",
    subgroups             = NULL,
    logbase               = 2,
    impute                = FALSE,
    plot                  = FALSE,
    label                 = 'feature_id',
    pca                   = plot,
    pls                   = plot,
    fit                   = if (plot) 'limma' else NULL,
    formula               = ~ subgroup,
    block                 = NULL,
    coefs                 = NULL,
    contrasts             = NULL,
    palette               = NULL,
    verbose               = TRUE)
{
    `Compounds ID` <- Name <- NULL
# Select files
    assert_is_a_string(dir)
    assert_all_are_dirs(dir)
    assert_is_not_null(files)
    assert_all_are_greater_than(length(files), 0)
    files <- if (!has_names(files)) {
      magrittr::set_names(files, tools::file_path_sans_ext(basename(files)))
    } else { files }
# Assert
    for (fl in files) assert_compounddiscoverer_output(fl)
    if (!is.null(colname_regex)) assert_is_a_string(colname_regex)
    if (!is.null(colname_format)) assert_is_function(colname_format)
    assert_is_function(mod_extract)
    quantity <- if (is.null(quantity)) {
      sapply(files, guess_compounddiscoverer_quantity) %>% unique()
    } else { quantity }
    assert_is_a_string(quantity)
    assert_is_subset(quantity, names(COMPOUNDDISCOVERER_PATTERNS))
    assert_is_a_bool(nonames)
    if (!is.null(exclude_sname_pattern))
      assert_is_a_string(exclude_sname_pattern)
    assert_is_a_bool(verbose)
    if (!is.null(logbase)) assert_is_a_number(logbase) %>%
      assert_all_are_greater_than(1)
# Read/Merge
    cddt <- lapply(
      files,
      .read_compounddiscoverer, colname_format = colname_format,
      mod_extract = mod_extract, quantity = quantity,
      verbose = verbose) %>%
      merge_compounddiscoverer(quantity = quantity, verbose = verbose)
# Currently intro of `feature_id` only, no `annotate_maxquant` equivalent (yet)
    cddt[, feature_id := Name]
# SumExp
    if (verbose)  cmessage('%sSumExp', spaces(4))
    pattern <- COMPOUNDDISCOVERER_PATTERNS[[quantity]]
    cdmat <- mqdt_to_mat(cddt, pattern, logbase = logbase, verbose = verbose)
    cddt %<>% extract(, names(cddt) %>% setdiff(colnames(cdmat)), with = FALSE)
    object <- list(cdmat) %>%
      magrittr::set_names(paste0('log', logbase, quantity)) %>%
      SummarizedExperiment(rowData = cddt)
# dequantify_compounddiscoverer.
    object$cdcol <- colnames(object)
    colnames(object) %<>%
      dequantify_compounddiscoverer(quantity = quantity, verbose = verbose)
# process_compounddiscoverer
    object %<>% process_compounddiscoverer(
        nonames = nonames, exclude_sname_pattern = exclude_sname_pattern, 
        subgroups = subgroups, impute = impute, verbose = verbose)
    assays <- c(assayNames(object)[1])
    for (assay in assays)  object %<>% add_assay_means(assay)
# Process statistically - if requested
    object %<>% analyze(
        pca       = pca,          pls     = pls,
        fit       = fit,          formula = formula,
        block     = block,        coefs   = coefs,
        contrasts = contrasts,    plot    = plot,
        label     = label,        palette = palette,
        verbose   = verbose )
    object
}

#----------------------------------------------------------------------------
#
#                 process_compounddiscoverer
#
#----------------------------------------------------------------------------

process_compounddiscoverer <- function(
    object, nonames, exclude_sname_pattern, subgroups, impute, verbose
){
    `cdcol` <- `Name` <- NULL
# Deal with nonames
    if (!nonames) object %<>% filter_features(
      !stringi::stri_isempty(Name), verbose = verbose)
# Deal with sample filter strings
    if (!is.null(exclude_sname_pattern)) {
      idx <- object[["cdcol"]] %>%
        stringi::stri_detect_regex(exclude_sname_pattern, negate = TRUE)
      if (verbose)  cmessage(
        '%s%d/%d retained when filtering against %s in sample names',
        spaces(14), sum(idx), length(idx),
        paste0("'", exclude_sname_pattern, "'"))
      object %<>% magrittr::extract(,idx)
    }
# Infer Subgroup
    object$sample_id <- colnames(object)
    object %<>% add_subgroup(verbose = verbose)
    cols <- c('sample_id', 'subgroup', 'replicate', 'cdcol') %>%
      intersect(svars(object))
    sdt(object) %<>% pull_columns(cols)
# Samples
    object %<>% filter_samples_available_for_some_feature(verbose = verbose)
    if (!is.null(subgroups)) {
        assert_is_subset(subgroups, as.character(object$subgroup))
        object %<>% filter_samples(subgroup %in% subgroups, verbose = verbose)
    }
# Features
    analysis(object)$nfeatures <- c(nrow(object))
    object %<>% rm_missing_in_all_samples(verbose = verbose)
# Impute
    if (impute) object %<>% impute(plot = FALSE)
    object
}