R/read_write_mgf.R

Defines functions read_mgf_gnps

Documented in read_mgf_gnps

#' @title read_mgf_gnps
#' @description Read MGF data (from GNPS).
#' \lifecycle{experimental}
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @param threads threads number.
#' @importFrom future plan
#' @importFrom furrr future_map
#' @return Return ms2 data. This is a list.
#' @export

# sxtTools::setwd_project()
# setwd("other_files/all_ms2_database/gnps/")
# x = read_mgf_gnps(file = "HMDB.mgf")

read_mgf_gnps = function(file,
                         threads = 3) {
  # pbapply::pboptions(style = 1)
  cat(crayon::green("Reading mgf data from GNPS...\n"))
  
  future::plan(strategy = future::multisession, workers = threads)
  
  ms2 <- furrr::future_map(
    .x = file,
    .f = function(temp.mgf.data) {
      mgf.data <- readr::read_lines(temp.mgf.data)
      mgf.data = mgf.data[mgf.data != ""]
      mgf.data = mgf.data[mgf.data != " "]
      
      begin_idx = which(mgf.data == "BEGIN IONS")
      end_idx = which(mgf.data == "END IONS")
      
      mgf.data =
        purrr::map2(
          .x = begin_idx,
          .y = end_idx,
          .f = function(idx1, idx2) {
            temp = mgf.data[c(idx1:idx2)]
            temp = temp[temp != "BEGIN IONS"]
            temp = temp[temp != "END IONS"]
            info = temp[grep(":", temp)]
            
            info = stringr::str_split(info, ":", n = 2) %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(info = V1, value = V2)
            
            spec = temp[-grep(":", temp)]
            spec = stringr::str_split(spec, "\t") %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(mz = V1, intensity = V2) %>%
              dplyr::mutate(mz = as.numeric(mz),
                            intensity = as.numeric(intensity)) %>%
              as.data.frame()
            list(info = info, spec = spec)
          }
        )
    }, .progress = TRUE
  )
  
  ms2 = Reduce(`c`, ms2)
  cat(crayon::green("Done.\n"))
  ms2
}


#' @title read_mgf_mona
#' @description Read MGF data (from mona).
#' \lifecycle{experimental}
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @param threads threads number.
#' @return Return ms2 data. This is a list.
#' @export

# sxtTools::setwd_project()
# setwd("other_files/all_ms2_database/mona/2021_6_10/")
# x = read_mgf_mona(file = "MoNA-export-LC-MS-MS_Spectra.mgf")

read_mgf_mona = function(file,
                         threads = 3) {
  # pbapply::pboptions(style = 1)
  cat(crayon::green("Reading mgf data from MoNA...\n"))
  future::plan(strategy = future::multisession, workers = threads)
  ms2 <- furrr::future_map(
    .x = file,
    .f = function(temp.mgf.data) {
      mgf.data <- readr::read_lines(temp.mgf.data)
      mgf.data = mgf.data[mgf.data != ""]
      mgf.data = mgf.data[mgf.data != " "]
      
      begin_idx = which(mgf.data == "BEGIN IONS")
      end_idx = which(mgf.data == "END IONS")
      
      mgf.data =
        purrr::map2(
          .x = begin_idx,
          .y = end_idx,
          .f = function(idx1, idx2) {
            temp = mgf.data[c(idx1:idx2)]
            temp = temp[temp != "BEGIN IONS"]
            temp = temp[temp != "END IONS"]
            info = temp[grep(":", temp)]
            
            info = stringr::str_split(info, ":", n = 2) %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(info = V1, value = V2)
            
            spec = temp[-grep(":", temp)]
            spec = stringr::str_split(spec, " ") %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(mz = V1, intensity = V2) %>%
              dplyr::mutate(mz = as.numeric(mz),
                            intensity = as.numeric(intensity)) %>%
              as.data.frame()
            list(info = info, spec = spec)
          }
        )
      
    }, .progress = TRUE
  )
  
  ms2 = Reduce(`c`, ms2)
  cat(crayon::green("Done.\n"))
  ms2
}



#' @title read_mgf_experiment
#' @description Read MGF data from experiment.
#' \lifecycle{experimental}
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @param threads threads number.
#' @importFrom magrittr %>%
#' @importFrom furrr future_map
#' @return Return ms2 data. This is a list.
#' @export

# sxtTools::setwd_project()
# setwd("example/")
# file = c("QC1_MSMS_NCE25.mgf", "QC1_MSMS_NCE25.mgf")

read_mgf_experiment = function(file,
                               threads = 3) {
  cat(crayon::green("Reading mgf data...\n"))
  future::plan(strategy = future::multisession, workers = threads)
  ms2 <- furrr::future_map(
    .x = file,
    .f = function(temp.mgf.data) {
      mgf.data <- readr::read_lines(temp.mgf.data)
      mgf.data = mgf.data[mgf.data != ""]
      mgf.data = mgf.data[mgf.data != " "]
      
      begin_idx = which(mgf.data == "BEGIN IONS")
      end_idx = which(mgf.data == "END IONS")
      
      mgf.data =
        purrr::map2(
          .x = begin_idx,
          .y = end_idx,
          .f = function(idx1, idx2) {
            temp = mgf.data[c(idx1:idx2)]
            temp = temp[temp != "BEGIN IONS"]
            temp = temp[temp != "END IONS"]
            info = grep("=", temp, value = TRUE)
            
            info = stringr::str_split(info, "=", n = 2) %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(info = V1, value = V2)
            
            spec = temp[-grep("=", temp, value = FALSE)]
            spec = stringr::str_split(spec, " ") %>%
              do.call(rbind, .) %>%
              as.data.frame() %>%
              dplyr::rename(mz = V1, intensity = V2) %>%
              dplyr::mutate(mz = as.numeric(mz),
                            intensity = as.numeric(intensity)) %>%
              as.data.frame()
            list(info = info, spec = spec)
          }
        )
    }, .progress = TRUE
  )
  
  ms2 = Reduce(`c`, ms2)
  cat(crayon::green("Done.\n"))
  ms2
}


#' @title read_mgf
#' @description Read MGF data.
#' \lifecycle{experimental}
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @return Return ms2 data. This is a list.
#' @export

read_mgf = function(file) {
  pbapply::pboptions(style = 1)
  cat(crayon::green("Reading mgf data...\n"))
  # mgf.data.list <- pbapply::pblapply(file, ListMGF)
  ms2 <- purrr::map(
    .x = file,
    .f = function(mgf.data) {
      mgf.data <- ListMGF(mgf.data)
      # nl.spec <- grep('^\\d', mgf.data)
      nl.spec <-
        lapply(mgf.data, function(x)
          grep('^\\d', x))
      info.mz <-
        lapply(mgf.data, function(x)
          grep('^PEPMASS', x, value = T))
      info.rt <-
        lapply(mgf.data, function(x)
          grep('^RTINSECONDS', x, value = T))
      
      info.mz <- unlist(info.mz)
      #for orbitrap data, the intensity of precursor ion should be removed
      info.mz <-
        unlist(lapply(strsplit(x = info.mz, split = " "), function(x)
          x[1]))
      info.mz <-
        as.numeric(gsub(pattern = "\\w+=", "", info.mz))
      info.rt <- unlist(info.rt)
      info.rt <-
        as.numeric(gsub(pattern = "\\w+=", "", info.rt))
      
      if (length(mgf.data) == 1) {
        spec <- mapply(function(x, y) {
          temp <- do.call(rbind, strsplit(x[y], split = " "))
          list(temp)
        },
        x = mgf.data,
        y = nl.spec)
      } else{
        spec <- mapply(function(x, y) {
          do.call(rbind, strsplit(x[y], split = " "))
        },
        x = mgf.data,
        y = nl.spec)
      }
      
      spec <- lapply(spec, function(x) {
        temp <- cbind(as.numeric(x[, 1]), as.numeric(x[, 2]))
        temp <- matrix(temp, ncol = 2)
        # if(nrow(temp) > 0) temp <- temp[temp[,2] >= max(temp[,2])*0.01,]
        temp <- matrix(temp, ncol = 2)
        colnames(temp) <- c("mz", "intensity")
        temp
      })
      
      ms2 <- mapply(function(x, y, z) {
        info <- c(y, z)
        names(info) <- c("mz", "rt")
        spectrum <- as.matrix(x)
        temp <- list(info, spectrum)
        names(temp) <- c("info", "spec")
        list(temp)
      },
      x = spec,
      y = info.mz,
      z = info.rt)
      
      ms2
      
    }
  )
  
  spec.info <- ms2[[1]]
  if (length(ms2) > 1) {
    for (i in 2:length(ms2)) {
      spec.info <- c(spec.info, ms2[[i]])
    }
  }
  
  remove.idx <-
    which(unlist(lapply(spec.info, function(x)
      nrow(x[[2]]))) == 0)
  if (length(remove.idx) != 0)
    spec.info <- spec.info[-remove.idx]
  # ##remove noise
  # cat("\n")
  # cat("Remove noise of MS/MS spectra...\n")
  # spec.info <- pbapply::pblapply(spec.info, function(x){
  #   temp.spec <- x[[2]]
  #   temp.spec <- removeNoise(temp.spec)
  #   x[[2]] <- temp.spec
  #   x
  # })
  
  spec.info <- spec.info
}

ListMGF = function(file) {
  mgf.data <- readLines(file)
  nl.rec.new <- 1
  idx.rec <- 1
  rec.list <- list()
  for (nl in seq_along(mgf.data))
  {
    if (mgf.data[nl] == "END IONS")
    {
      rec.list[idx.rec] <- list(Compound = mgf.data[nl.rec.new:nl])
      nl.rec.new <- nl + 1
      idx.rec <- idx.rec + 1
    }
  }
  rec.list
}















#######------------------------------------------------------------------------
#' @title readMGF
#' @description Read MGF data.
#' \lifecycle{experimental}
#' @author Xiaotao Shen
#' \email{shenxt1990@@163.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @return Return ms2 data. This is a list.
#' @export

readMGF = function(file) {
  cat(crayon::yellow("`readMGF()` is deprecated, use `read_mgf()`.\n"))
  pbapply::pboptions(style = 1)
  cat(crayon::green("Reading MS2 data...\n"))
  # mgf.data.list <- pbapply::pblapply(file, ListMGF)
  ms2 <- pbapply::pblapply(file, function(mgf.data) {
    mgf.data <- ListMGF(mgf.data)
    # nl.spec <- grep('^\\d', mgf.data)
    nl.spec <-
      lapply(mgf.data, function(x)
        grep('^\\d', x))
    info.mz <-
      lapply(mgf.data, function(x)
        grep('^PEPMASS', x, value = T))
    info.rt <-
      lapply(mgf.data, function(x)
        grep('^RTINSECONDS', x, value = T))
    
    info.mz <- unlist(info.mz)
    #for orbitrap data, the intensity of precursor ion should be removed
    info.mz <-
      unlist(lapply(strsplit(x = info.mz, split = " "), function(x)
        x[1]))
    info.mz <-
      as.numeric(gsub(pattern = "\\w+=", "", info.mz))
    info.rt <- unlist(info.rt)
    info.rt <-
      as.numeric(gsub(pattern = "\\w+=", "", info.rt))
    
    if (length(mgf.data) == 1) {
      spec <- mapply(function(x, y) {
        temp <- do.call(rbind, strsplit(x[y], split = " "))
        list(temp)
      },
      x = mgf.data,
      y = nl.spec)
    } else{
      spec <- mapply(function(x, y) {
        do.call(rbind, strsplit(x[y], split = " "))
      },
      x = mgf.data,
      y = nl.spec)
    }
    
    spec <- lapply(spec, function(x) {
      temp <- cbind(as.numeric(x[, 1]), as.numeric(x[, 2]))
      temp <- matrix(temp, ncol = 2)
      # if(nrow(temp) > 0) temp <- temp[temp[,2] >= max(temp[,2])*0.01,]
      temp <- matrix(temp, ncol = 2)
      colnames(temp) <- c("mz", "intensity")
      temp
    })
    
    ms2 <- mapply(function(x, y, z) {
      info <- c(y, z)
      names(info) <- c("mz", "rt")
      spectrum <- as.matrix(x)
      temp <- list(info, spectrum)
      names(temp) <- c("info", "spec")
      list(temp)
    },
    x = spec,
    y = info.mz,
    z = info.rt)
    
    ms2
    
  })
  
  
  spec.info <- ms2[[1]]
  if (length(ms2) > 1) {
    for (i in 2:length(ms2)) {
      spec.info <- c(spec.info, ms2[[i]])
    }
  }
  
  remove.idx <-
    which(unlist(lapply(spec.info, function(x)
      nrow(x[[2]]))) == 0)
  if (length(remove.idx) != 0)
    spec.info <- spec.info[-remove.idx]
  # ##remove noise
  # cat("\n")
  # cat("Remove noise of MS/MS spectra...\n")
  # spec.info <- pbapply::pblapply(spec.info, function(x){
  #   temp.spec <- x[[2]]
  #   temp.spec <- removeNoise(temp.spec)
  #   x[[2]] <- temp.spec
  #   x
  # })
  
  spec.info <- spec.info
}
jaspershen/metID documentation built on July 31, 2022, 11:31 p.m.