R/construct_database.R

Defines functions construct_database

Documented in construct_database

##------------------------------------------------------------------------------
#' @title Construct in-house or public MS2 database for metID.
#' @description Construct MS2 spectra database according to mzXML data and compound information table (csv format).
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param path Work directory.
#' @param version The version of you database. Default is 0.0.1.
#' @param metabolite.info.name The metabolite information table name, it must be csv format.
#' The demo data can be got from the `demoData` package.
#' Please see https://jaspershen.github.io/metID/articles/metID.html
#' @param source The source of your database.
#' @param link Website link of the source.
#' @param creater Creater name. For example, Xiaotao Shen.
#' @param email email address.
#' @param rt Do the metabolites have RT information or not?. If not, set it as FALSE.
#' @param mz.tol m/z tolerance for the match between metabolites and precursor m/z of MS2 spectra.
#' @param rt.tol RT tolerance for the match between metabolites and precursor m/z of MS2 spectra.
#' @param threads The number of threads
#' @importFrom magrittr %>%
#' @importFrom crayon red yellow green bgRed
#' @importFrom stringr str_detect str_extract
#' @importFrom readr cols
#' @importFrom pbapply pblapply
#' @return A databaseClass object.
#' @seealso The example and demo data of this function can be found
#' https://jaspershen.github.io/metID/articles/metID.html
#' @export


# database0.0.1 <- construct_database(
#   path = ".",
#   version = "0.0.1",
#   metabolite.info.name = "metabolite.info.csv",
#   creater = "dumine",
#   email = "dumine@zju.edu.cn",
#   rt = FALSE,
#   mz.tol = 15,
#   rt.tol = 30,
#   threads = 5
# )



construct_database = function(path = ".",
                              version = "0.0.1",
                              metabolite.info.name = "metabolite.info.csv",
                              source = "Michael Snyder Lab",
                              link = "http://snyderlab.stanford.edu/",
                              creater = "Xiaotao Shen",
                              email = "shenxt1990@163.com",
                              rt = TRUE,
                              mz.tol = 15,
                              rt.tol = 30,
                              threads = 3) {
  ##check data first
  file <- dir(path)
  if (all(file != metabolite.info.name)) {
    cat(crayon::red("No", metabolite.info.name, "in your", path, "\n"))
    return(NULL)
  }
  
  if (all(file != "POS")) {
    cat(crayon::red("No POS file in your", path, "\n"))
  } else{
    file_pos <- dir(file.path(path, "POS"))
    if (length(file_pos) == 0) {
      cat(crayon::red("No mzXML files in POS folder\n"))
    } else{
      if (sum(stringr::str_detect(file_pos, "mzXML")) == 0) {
        cat(crayon::red("No mzXML files in POS folder\n"))
      }
    }
  }
  
  if (all(file != "NEG")) {
    cat(crayon::red("No NEG file in your", path, "\n"))
  } else{
    file_neg <- dir(file.path(path, "NEG"))
    if (length(file_neg) == 0) {
      cat(crayon::red("No mzXML files in NEG folder\n"))
    } else{
      if (sum(stringr::str_detect(file_neg, "mzXML")) == 0) {
        cat(crayon::red("No mzXML files in NEG folder\n"))
      }
    }
  }
  
  ##read metabolite information
  cat(crayon::green("Reading metabolite information...\n"))
  metabolite.info <-
    readTable(file = file.path(path, metabolite.info.name))
  
  cat(crayon::green("Reading positive MS2 data...\n"))
  
  file.pos <-
    dir(file.path(path, 'POS'), full.names = TRUE)
  
  if (length(file.pos) > 0) {
    ms2.data.pos <-
      read_mzxml(file = file.pos, threads = threads)
    
    ms1.info.pos <- lapply(ms2.data.pos, function(x) {
      x[[1]]
    })
    
    ms1.info.pos <- do.call(rbind, ms1.info.pos) %>%
      as.data.frame()
    
    ms1.info.pos$file <- basename(ms1.info.pos$file)
    
    ms2.info.pos <- lapply(ms2.data.pos, function(x) {
      x[[2]]
    })
    
    rm(list = "ms2.data.pos")
    
    cat(crayon::red("OK\n"))
  } else{
    ms1.info.pos <- NULL
  }
  
  cat(crayon::green("Reading negative MS2 data...\n"))
  
  file.neg <-
    dir(file.path(path, 'NEG'), full.names = TRUE)
  
  if (length(file.neg) > 0) {
    ms2.data.neg <-
      read_mzxml(file = file.neg, threads = threads)
    
    ms1.info.neg <- lapply(ms2.data.neg, function(x) {
      x[[1]]
    })
    
    ms1.info.neg <- do.call(rbind, ms1.info.neg) %>%
      as.data.frame()
    
    ms1.info.neg$file <- basename(ms1.info.neg$file)
    
    ms2.info.neg <- lapply(ms2.data.neg, function(x) {
      x[[2]]
    })
    
    rm(list = "ms2.data.neg")
    cat(crayon::red("OK\n"))
  } else{
    ms1.info.neg <- NULL
  }
  
  ###---------------------------------------------------------------------------
  cat(crayon::green("Matching metabolites with MS2 spectra (positive)...\n"))
  
  if (!is.null(ms1.info.pos)) {
    match.result.pos <-
      SXTMTmatch(
        data1 = as.data.frame(metabolite.info[, c("mz.pos", "RT")]),
        data2 = ms1.info.pos[, c(2, 3)],
        mz.tol = mz.tol,
        rt.tol = rt.tol,
        rt.error.type = "abs"
      )
    
    match.result.pos <-
      data.frame(match.result.pos,
                 "file" = ms1.info.pos$file[match.result.pos[, 2]],
                 stringsAsFactors = FALSE)
    
    unique.idx1 <- unique(match.result.pos[, 1])
    
    spectra.pos <-
      pbapply::pblapply(unique.idx1, function(idx) {
        temp.match.result.pos <-
          match.result.pos[which(match.result.pos$Index1 == idx), , drop = FALSE]
        if (nrow(temp.match.result.pos) == 0) {
          return(NULL)
        }
        temp.submitter <- metabolite.info$Submitter[idx]
        if (!is.na(temp.submitter) &
            length(grep(temp.submitter, temp.match.result.pos[, 9])) > 0) {
          temp.match.result.pos <-
            temp.match.result.pos[grep(temp.submitter, temp.match.result.pos[, 9]),]
        }
        
        if (nrow(temp.match.result.pos) == 0) {
          return(NULL)
        }
        
        if (nrow(temp.match.result.pos) == 1) {
          temp.ms2.pos <- ms2.info.pos[temp.match.result.pos[1, 2]]
          names(temp.ms2.pos) <-
            stringr::str_extract(string = temp.match.result.pos[1, 9],
                                 pattern = "NCE[0-9]{1,3}")
          return(temp.ms2.pos)
        }
        
        unique.file.name <-
          unique(temp.match.result.pos$file)
        
        temp.ms2.pos <-
          lapply(unique.file.name, function(temp.name) {
            temp.x <-
              temp.match.result.pos[which(temp.match.result.pos$file == temp.name), , drop = FALSE]
            temp.idx <-
              which.max(unlist(lapply(ms2.info.pos[temp.x[, 2]], function(y) {
                sum(y[, 2])
              })))
            ms2.info.pos[[temp.x[temp.idx, 2]]]
          })
        
        names(temp.ms2.pos) <-
          stringr::str_extract(string = unique.file.name,
                               pattern = "NCE[0-9]{1,3}")
        temp.ms2.pos
      })
    
    names(spectra.pos) <-
      metabolite.info$Lab.ID[unique.idx1]
    
    spectra.pos <-
      spectra.pos[which(!unlist(lapply(spectra.pos, is.null)))]
    
    cat(crayon::red("OK\n"))
  } else{
    spectra.pos <- NULL
  }
  
  ###---------------------------------------------------------------------------
  cat(crayon::green("Matching metabolites with MS2 spectra (negative)...\n"))
  if (!is.null(ms1.info.neg)) {
    match.result.neg <-
      SXTMTmatch(
        data1 = as.data.frame(metabolite.info[, c("mz.neg", "RT")]),
        data2 = ms1.info.neg[, c(2, 3)],
        mz.tol = mz.tol,
        rt.tol = rt.tol,
        rt.error.type = "abs"
      )
    
    match.result.neg <- data.frame(match.result.neg,
                                   "file" = ms1.info.neg$file[match.result.neg[, 2]],
                                   stringsAsFactors = FALSE)
    
    unique.idx1 <- unique(match.result.neg[, 1])
    
    spectra.neg <-
      pbapply::pblapply(unique.idx1, function(idx) {
        temp.match.result.neg <-
          match.result.neg[which(match.result.neg$Index1 == idx), , drop = FALSE]
        if (nrow(temp.match.result.neg) == 0) {
          return(NULL)
        }
        
        temp.submitter <- metabolite.info$Submitter[idx]
        if (!is.na(temp.submitter) &
            length(grep(temp.submitter, temp.match.result.neg[, 9])) > 0) {
          temp.match.result.neg <-
            temp.match.result.neg[grep(temp.submitter, temp.match.result.neg[, 9]),]
        }
        
        if (nrow(temp.match.result.neg) == 0) {
          return(NULL)
        }
        
        if (nrow(temp.match.result.neg) == 1) {
          temp.ms2.neg <- ms2.info.neg[temp.match.result.neg[1, 2]]
          names(temp.ms2.neg) <-
            stringr::str_extract(string = temp.match.result.neg[1, 9],
                                 pattern = "NCE[0-9]{1,3}")
          return(temp.ms2.neg)
        }
        
        unique.file.name <-
          unique(temp.match.result.neg$file)
        
        temp.ms2.neg <-
          lapply(unique.file.name, function(temp.name) {
            temp.x <-
              temp.match.result.neg[which(temp.match.result.neg$file == temp.name), , drop = FALSE]
            temp.idx <-
              which.max(unlist(lapply(ms2.info.neg[temp.x[, 2]], function(y) {
                sum(y[, 2])
              })))
            ms2.info.neg[[temp.x[temp.idx, 2]]]
          })
        
        names(temp.ms2.neg) <-
          stringr::str_extract(string = unique.file.name,
                               pattern = "NCE[0-9]{1,3}")
        temp.ms2.neg
      })
    
    names(spectra.neg) <-
      metabolite.info$Lab.ID[unique.idx1]
    
    spectra.neg <-
      spectra.neg[which(!unlist(lapply(spectra.neg, is.null)))]
    cat(crayon::red("OK\n"))
  } else{
    spectra.neg <- NULL
  }
  
  if (is.null(spectra.pos) & is.null(spectra.neg)) {
    Spectra <- list()
  } else{
    Spectra <- list("Spectra.positive" = spectra.pos,
                    "Spectra.negative" = spectra.neg)
  }
  
  database.info <- list(
    "Version" = version,
    "Source" = source,
    "Link" = link,
    "Creater" = creater,
    "Email" = email,
    "RT" = rt
  )
  
  spectra.info <- as.data.frame(metabolite.info)
  rm(list = "metabolite.info")
  
  msDatabase0.0.1 <- new(
    Class = "databaseClass",
    database.info = database.info,
    spectra.info = spectra.info,
    spectra.data = Spectra
  )
  
  msDatabase0.0.1@database.info$RT <-
    ifelse(all(is.na(msDatabase0.0.1@spectra.info$RT)), FALSE, TRUE)
  cat(crayon::bgRed("All done!\n"))
  return(msDatabase0.0.1)
}
jaspershen/metID documentation built on July 31, 2022, 11:31 p.m.