R/prepare_libraries_rt.R

Defines functions prepare_libraries_rt

Documented in prepare_libraries_rt

#' @title Prepare libraries of retention times
#'
#' @description This function prepares retention times libraries
#'    to be used for later
#'
#' @include get_params.R
#' @include import_spectra.R
#'
#' @param mgf_exp MGF containing experimental retention times
#' @param mgf_is MGF containing in silico predicted retention times
#' @param temp_exp File containing experimental retention times
#' @param temp_is File containing in silico predicted retention times
#' @param output_rt Output retention time file
#' @param output_sop Output pseudo sop file
#' @param col_ik Name of the InChIKey in mgf
#' @param col_rt Name of the retention time in mgf
#' @param col_sm Name of the SMILES in mgf
#' @param name_inchikey Name of the InChIKey in file
#' @param name_rt Name of the retention time in file
#' @param name_smiles Name of the SMILES in file
#' @param unit_rt Unit of the retention time. Must be "seconds" or "minutes"
#'
#' @return The path to the prepared retention time library
#'
#' @export
#'
#' @examples
#' \dontrun{
#' tima:::copy_backbone()
#' go_to_cache()
#' prepare_libraries_rt()
#' unlink("data", recursive = TRUE)
#' }
prepare_libraries_rt <-
  function(mgf_exp = get_params(step = "prepare_libraries_rt")$files$libraries$temporal$exp$mgf,
           mgf_is = get_params(step = "prepare_libraries_rt")$files$libraries$temporal$is$mgf,
           temp_exp = get_params(step = "prepare_libraries_rt")$files$libraries$temporal$exp$csv,
           temp_is = get_params(step = "prepare_libraries_rt")$files$libraries$temporal$is$csv,
           output_rt = get_params(step = "prepare_libraries_rt")$files$libraries$temporal$prepared,
           output_sop = get_params(step = "prepare_libraries_rt")$files$libraries$sop$prepared$rt,
           col_ik = get_params(step = "prepare_libraries_rt")$names$mgf$inchikey,
           col_rt = get_params(step = "prepare_libraries_rt")$names$mgf$retention_time,
           col_sm = get_params(step = "prepare_libraries_rt")$names$mgf$smiles,
           name_inchikey = get_params(step = "prepare_libraries_rt")$names$inchikey,
           name_rt = get_params(step = "prepare_libraries_rt")$names$rt$library,
           name_smiles = get_params(step = "prepare_libraries_rt")$names$smiles,
           unit_rt = get_params(step = "prepare_libraries_rt")$units$rt) {
    ## default transforms from `Spectra`
    if (col_rt == "RTINSECONDS") {
      col_rt <- "rtime"
    }
    if (col_sm == "SMILES") {
      col_sm <- "smiles"
    }

    full_or_null <- function(x) {
      if (length(x) == 0) {
        NULL
      } else {
        x
      }
    }
    mgf_exp <- mgf_exp |>
      purrr::map(.f = full_or_null)
    mgf_exp <- mgf_exp[mgf_exp |>
      purrr::map(
        .f = function(x) {
          !is.null(x)
        }
      ) |>
      unlist()]
    mgf_is <- mgf_is |>
      purrr::map(.f = full_or_null)
    mgf_is <- mgf_is[mgf_is |>
      purrr::map(
        .f = function(x) {
          !is.null(x)
        }
      ) |>
      unlist()]
    mgf_exp <- mgf_exp |>
      purrr::map(.f = full_or_null)
    temp_exp <- temp_exp[temp_exp |>
      purrr::map(
        .f = function(x) {
          !is.null(x)
        }
      ) |>
      unlist()]
    mgf_is <- mgf_is |>
      purrr::map(.f = full_or_null)
    temp_is <- temp_is[temp_is |>
      purrr::map(
        .f = function(x) {
          !is.null(x)
        }
      ) |>
      unlist()]
    if (length(mgf_exp) == 0) {
      mgf_exp <- NULL
    }
    if (length(mgf_is) == 0) {
      mgf_is <- NULL
    }
    if (length(temp_exp) == 0) {
      temp_exp <- NULL
    }
    if (length(temp_is) == 0) {
      temp_is <- NULL
    }

    rts_from_mgf <-
      function(mgf) {
        log_debug("Importing spectra ...")
        spectra <- mgf |>
          purrr::map(.f = import_spectra)
        log_debug("Extracting retention times...")
        rts <- spectra |>
          purrr::map(.f = function(x) {
            x@backend@spectraData |>
              data.frame() |>
              tidytable::as_tidytable()
          }) |>
          tidytable::bind_rows() |>
          tidytable::select(tidyselect::any_of(c(
            rt = col_rt,
            inchikey = col_ik,
            smiles = col_sm
          )))
        return(rts)
      }

    rts_from_tab <-
      function(tab) {
        log_debug("Importing file ...")
        rts <- tab |>
          purrr::map(.f = tidytable::fread) |>
          tidytable::bind_rows() |>
          tidytable::select(tidyselect::any_of(
            c(
              rt = name_rt,
              inchikey = name_inchikey,
              smiles = name_smiles
            )
          ))
        return(rts)
      }

    polish_df <-
      function(df, type = "experimental", unit = unit_rt) {
        df_polished <- df |>
          data.frame() |>
          tidytable::mutate(type = type) |>
          tidytable::rowwise() |>
          tidytable::mutate(rt = tidytable::if_else(condition = unit == "seconds", true = as.numeric(rt) / 60, false = as.numeric(rt))) |>
          tidytable::bind_rows(data.frame(inchikey = NA_character_, smiles = NA_character_)) |>
          tidytable::filter(!is.na(rt)) |>
          tidytable::filter(!is.na(smiles)) |>
          tidytable::distinct()
        return(df_polished)
      }

    complete_df <- function(df) {
      df_full <- df |>
        tidytable::filter(!is.na(inchikey))
      df_missing <- df |>
        tidytable::filter(is.na(inchikey))
      log_debug(
        "There are",
        nrow(df_missing),
        "entries without InChIKey.",
        "We would recommend you adding them but will try completing.",
        "We will query them on the fly, this might take some time."
      )
      ## ISSUE see #19
      smiles <- unique(df_missing$smiles)

      get_inchikey <- function(smiles, toolkit = "rdkit") {
        url <- paste0(
          "https://api.naturalproducts.net/latest/convert/inchikey?smiles=",
          utils::URLencode(smiles),
          "&toolkit=",
          toolkit
        )
        tryCatch(
          expr = jsonlite::fromJSON(txt = url),
          error = function(e) {
            return(NA_character_)
          }
        )
      }

      inchikey <- purrr::map(.x = smiles, .f = get_inchikey) |>
        as.character()
      df_missing <- df_missing |>
        tidytable::select(-inchikey) |>
        tidytable::left_join(tidytable::tidytable(smiles, inchikey))
      rm(smiles)

      df_completed <- df_full |>
        tidytable::bind_rows(df_missing) |>
        tidytable::mutate(tidytable::across(
          .cols = tidyselect::where(is.character),
          .fns = function(x) {
            tidytable::na_if(x, "NA")
          }
        ))
      rm(df_full, df_missing)
      df_completed <- df_completed |>
        tidytable::select(rt,
          structure_smiles = smiles,
          structure_inchikey = inchikey,
          type
        ) |>
        tidytable::distinct()
      log_debug(
        "There were still",
        nrow(df_completed |>
          tidytable::filter(is.na(
            structure_inchikey
          ))),
        "entries for which no InChIKey could not be found in the end."
      )
      return(df_completed)
    }

    empty_df <- tidytable::tidytable(
      rt = NA_real_,
      structure_smiles = NA_character_,
      structure_inchikey = NA_character_,
      type = NA_character_
    )

    ## from mgf
    if (!is.null(mgf_exp)) {
      rts_exp_1 <- mgf_exp |>
        rts_from_mgf() |>
        polish_df() |>
        complete_df()
    } else {
      rts_exp_1 <- empty_df
    }
    if (!is.null(mgf_is)) {
      rts_is_1 <- mgf_is |>
        rts_from_mgf() |>
        polish_df(type = "predicted") |>
        complete_df()
    } else {
      rts_is_1 <- empty_df
    }

    ## from csv
    if (!is.null(temp_exp)) {
      rts_exp_2 <- temp_exp |>
        rts_from_tab() |>
        polish_df() |>
        complete_df()
    } else {
      rts_exp_2 <- empty_df
    }
    if (!is.null(temp_is)) {
      rts_is_2 <- temp_is |>
        rts_from_tab() |>
        polish_df(type = "predicted") |>
        complete_df()
    } else {
      rts_is_2 <- empty_df
    }

    df_rts <- tidytable::bind_rows(rts_exp_1, rts_exp_2, rts_is_1, rts_is_2) |>
      tidytable::filter(!is.na(as.numeric(rt))) |>
      tidytable::filter(!is.na((structure_inchikey)))
    rm(rts_exp_1, rts_exp_2, rts_is_1, rts_is_2)

    sop <- df_rts |>
      tidytable::select(structure_smiles, structure_inchikey) |>
      tidytable::distinct() |>
      tidytable::mutate(
        structure_inchikey_no_stereo = stringi::stri_sub(str = structure_inchikey, from = 1, to = 14),
        organism_name = NA_character_
      )

    rts <- df_rts |>
      tidytable::select(-structure_smiles) |>
      tidytable::distinct() |>
      tidytable::mutate(
        candidate_structure_inchikey_no_stereo = gsub(
          pattern = "-.*",
          replacement = "",
          x = structure_inchikey,
          perl = TRUE
        )
      ) |>
      ## TODO REMINDER FOR NOW
      tidytable::distinct(rt, candidate_structure_inchikey_no_stereo, type)
    rm(df_rts)

    if (nrow(rts) == 0) {
      log_debug("No retention time library found, returning empty organism table.")
      sop <- tidytable::tidytable(
        structure_smiles = NA_character_,
        structure_inchikey = NA_character_,
        structure_inchikey_no_stereo = NA_character_,
        organism_name = NA_character_
      )
    }

    if (nrow(rts) == 0) {
      log_debug("No retention time library found, returning empty retention time table.")
      rts <- tidytable::tidytable(
        rt = NA_real_,
        candidate_structure_inchikey_no_stereo = NA_character_,
        type = NA_character_
      )
    }

    tima:::export_params(
      parameters = get_params(step = "prepare_libraries_rt"),
      step = "prepare_libraries_rt"
    )
    tima:::export_output(x = rts, file = output_rt)
    tima:::export_output(x = sop, file = output_sop)
    rm(rts, sop)
    return(c("rt" = output_rt, "sop" = output_sop))
  }
taxonomicallyinformedannotation/tima-r documentation built on Jan. 25, 2025, 12:43 p.m.