#' @title Get MS2 spectra of peaks from databaseClass object
#' @description Get MS2 spectra of peaks from databaseClass object.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param lab.id The lab ID of metabolite.
#' @param database Database (databaseClass object).
#' @param polarity positive or negative.
#' @param ce Collision value.
#' @importFrom crayon red yellow green bgRed
#' @importFrom stringr str_detect str_extract
#' @importFrom readr cols
#' @importFrom pbapply pblapply
#' @return A MS2 spectrum (data.frame).
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
getMS2spectrum <-
function(lab.id,
database,
polarity = c("positive", "negative"),
ce = "30") {
message("`getMS2spectrum()` is deprecated, use `get_ms2_spectrum()`.")
polarity <- match.arg(polarity)
if (!is(database, "databaseClass")) {
stop("The database must be databaseClass object.\n")
}
pol <- ifelse(polarity == "positive", 1, 2)
temp <-
database@spectra.data[[pol]][[match(lab.id, names(database@spectra.data[[pol]]))]]
temp[[match(ce, names(temp))]]
}
#' @title Generate the mzIdentify parameter list
#' @description Generate the mzIdentify parameter list.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param candidate.num The number of candidates.
#' @param database MS1 database name.
#' @param threads Number of threads
#' @return A mzIdentifyClass object.
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
mzIdentifyParam <-
function(ms1.match.ppm = 25,
polarity = c("positive", "negative"),
column = c("rp", "hilic"),
candidate.num = 1,
database,
threads = 3) {
message("`mzIdentifyParam()` is deprecated.")
if (missing(database)) {
stop("The database name must be provided.\n")
}
if (database != "HMDB.metabolite.data") {
stop("database can only be HMDB.metabolite.data\n")
}
polarity <- match.arg(polarity)
column <- match.arg(column)
param <- list(
ms1.match.ppm = ms1.match.ppm,
polarity = polarity,
column = column,
candidate.num = candidate.num,
database = database,
threads = threads
)
list("mzIdentifyParam" = param)
}
#' @title Show the base information of metid pacakge
#' @description Show the base information of metid pacakge.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @return A ASCII log of metid
#' @export
#' @examples
#' metid()
metid <-
function() {
message("`metid()` is deprecated, please use `metid_logo()`.")
cat(crayon::green(
c(
" _ _____ ___ ",
" _ __ ___ ___| |_ \\_ \\/ \\",
"| '_ ` _ \\ / _ \\ __| / /\\/ /\\ /",
"| | | | | | __/ |_/\\/ /_/ /_// ",
"|_| |_| |_|\\___|\\__\\____/___,' ",
" "
)
), sep = "\n")
}
# library(cowsay)
# # https://onlineasciitools.com/convert-text-to-ascii-art
# # writeLines(capture.output(say("Hello"), type = "message"), con = "ascii_art.txt")
# art <- readLines("logo.txt")
# dput(art)
# metid_logo <-
# c(" _ _____ ___ ", " _ __ ___ ___| |_ \\_ \\/ \\",
# "| '_ ` _ \\ / _ \\ __| / /\\/ /\\ /", "| | | | | | __/ |_/\\/ /_/ /_// ",
# "|_| |_| |_|\\___|\\__\\____/___,' ", " "
# )
# cat(metid_logo, sep = "\n")
#' @title Identify peaks based on MS1 database
#' @description Identify peaks based on MS1 database.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.data The name of ms1 peak table (csv format). Column 1 is "name", column 2 is
#' "mz" and column 3 is "rt" (retention time, second).
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param path Work directory.
#' @param candidate.num The number of candidates.
#' @param database MS1 database name or MS1 database.
#' @param threads Number of threads
#' @param silence.deprecated Silenc the deprecated information or not.
#' @return A mzIdentifyClass or metIdentifyClass object.
#' @importFrom magrittr %>%
#' @importFrom dplyr pull filter
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
mzIdentify <-
function(ms1.data,
##csv format
ms1.match.ppm = 25,
rt.match.tol = 30,
polarity = c("positive", "negative"),
column = c("hilic", "rp"),
path = ".",
candidate.num = 3,
database,
threads = 3,
silence.deprecated = FALSE) {
if (!silence.deprecated) {
message("`mzIdentify()` is deprecated.")
}
options(warn = -1)
###Check data
if (missing(database)) {
stop("No database is provided.\n")
}
##parameter specification
polarity <- match.arg(polarity)
column <- match.arg(column)
##check ms1.file and ms2.file
file <- dir(path)
if (!all(ms1.data %in% file)) {
stop("MS1 data is not in the directory, please check it.\n")
}
if (!is(database, "databaseClass")) {
if (!all(database %in% file)) {
stop("Database is not in this directory, please check it.\n")
}
}
#load database
if (!is(database, "databaseClass")) {
database.name <- database
load(file.path(path, database.name))
database <- get(database.name)
} else{
database.name = paste(database@database.info$Source,
database@database.info$Version,
sep = "_")
}
#------------------------------------------------------------------
##load adduct table
if (polarity == "positive" & column == "hilic") {
data("hilic.pos", envir = environment())
adduct.table <- hilic.pos
}
if (polarity == "positive" & column == "rp") {
data("rp.pos", envir = environment())
adduct.table <- rp.pos
}
if (polarity == "negative" & column == "hilic") {
data("hilic.neg", envir = environment())
adduct.table <- hilic.neg
}
if (polarity == "negative" & column == "rp") {
data("rp.neg", envir = environment())
adduct.table <- rp.neg
}
ms1.data <-
readr::read_csv(file = file.path(path, ms1.data),
col_types = readr::cols())
colnames(ms1.data)[seq_len(3)] <- c("name", "mz", "rt")
if (rt.match.tol > 10000) {
message(crayon::yellow(
"You set rt.match.tol > 10,000, so RT will not be used for matching."
))
} else{
message(
crayon::yellow(
"You set rt.match.tol < 10,000, so if the compounds have RT, RTs will be used for matching."
)
)
}
temp.fun <- function(idx,
ms1.data,
ms1.match.ppm = 25,
rt.match.tol = 30,
database,
adduct.table,
candidate.num = 3) {
temp.mz <-
as.numeric(dplyr::pull(.data = ms1.data[idx, ], var = "mz"))
temp.rt <-
as.numeric(dplyr::pull(.data = ms1.data[idx, ], var = "rt"))
rm(list = c("ms1.data"))
if (is(database, "databaseClass")) {
database <- database@spectra.info
}
temp.mz.diff1 <- abs(temp.mz - as.numeric(database$mz))
temp.mz.diff2 <- abs(temp.mz - as.numeric(database$mz) * 2)
temp.mz.diff3 <- abs(temp.mz - as.numeric(database$mz) * 3)
temp.mz.diff1[is.na(temp.mz.diff1)] <- 100000
temp.mz.diff2[is.na(temp.mz.diff2)] <- 100000
temp.mz.diff3[is.na(temp.mz.diff3)] <- 100000
database <-
database[which(
temp.mz.diff1 < max(abs(range(
adduct.table$mz
))) + 1 |
temp.mz.diff2 < max(abs(range(
adduct.table$mz
))) + 1 |
temp.mz.diff3 < max(abs(range(
adduct.table$mz
))) + 1
)
, , drop = FALSE]
rm(list = c("temp.mz.diff1", "temp.mz.diff2", "temp.mz.diff3"))
if (nrow(database) == 0)
return(NA)
spectra.mz <- purrr::map(as.data.frame(t(adduct.table)), function(x) {
temp.n <-
stringr::str_extract(string = as.character(x[1]), pattern = "[0-9]{1}M")
temp.n <-
as.numeric(stringr::str_replace(
string = temp.n,
pattern = "M",
replacement = ""
))
temp.n[is.na(temp.n)] <- 1
as.numeric(x[2]) + temp.n * as.numeric(database$mz)
}) %>%
do.call(cbind, .)
# spectra.mz <- apply(adduct.table, 1, function(x) {
# temp.n <-
# stringr::str_extract(string = as.character(x[1]), pattern = "[0-9]{1}M")
# temp.n <-
# as.numeric(stringr::str_replace(
# string = temp.n,
# pattern = "M",
# replacement = ""
# ))
# temp.n[is.na(temp.n)] <- 1
# as.numeric(x[2]) + temp.n * as.numeric(database$mz)
# })
colnames(spectra.mz) <- adduct.table[, 1]
rownames(spectra.mz) <- database$Lab.ID
###mz match
temp <-
abs(spectra.mz - temp.mz) * 10 ^ 6 / ifelse(temp.mz < 400, 400, temp.mz)
temp.idx <-
which(temp < ms1.match.ppm, arr.ind = TRUE)
if (nrow(temp.idx) == 0)
return(NA)
match.idx <- apply(temp.idx, 1, function(x) {
data.frame(
"Lab.ID" = rownames(spectra.mz)[x[1]],
"Addcut" = colnames(spectra.mz)[x[2]],
"mz.error" = temp[x[1], x[2]],
stringsAsFactors = FALSE
)
})
rm(list = c("spectra.mz", "adduct.table", "temp", "temp.idx"))
##remove some none matched
match.idx <-
match.idx[which(unlist(lapply(match.idx, function(x) {
nrow(x)
})) != 0)]
if (length(match.idx) == 0) {
return(NA)
}
match.idx <- do.call(rbind, match.idx)
# match.idx <- data.frame(rownames(match.idx), match.idx, stringsAsFactors = FALSE)
colnames(match.idx) <-
c("Lab.ID", "Adduct", "mz.error")
match.idx <-
match.idx[order(match.idx$mz.error, decreasing = FALSE), , drop = FALSE]
##rt match
RT.error <-
abs(temp.rt - as.numeric(database$RT)[match(match.idx$Lab.ID, database$Lab.ID)])
match.idx <- data.frame(match.idx, RT.error, stringsAsFactors = FALSE)
match.idx <-
dplyr::filter(match.idx, is.na(RT.error) |
RT.error < rt.match.tol)
if (nrow(match.idx) == 0) {
return(NA)
}
if (nrow(match.idx) > candidate.num) {
match.idx <- match.idx[seq_len(candidate.num), , drop = FALSE]
}
match.idx <- data.frame(match.idx, database[match(match.idx$Lab.ID, database$Lab.ID), c("Compound.name", "CAS.ID", "HMDB.ID", "KEGG.ID"), drop = FALSE], stringsAsFactors = FALSE)
match.idx <-
match.idx[, c(
"Compound.name",
"CAS.ID",
"HMDB.ID",
"KEGG.ID",
"Lab.ID",
"Adduct",
"mz.error",
'RT.error'
)]
rownames(match.idx) <- NULL
return(match.idx)
}
if (masstools::get_os() == "windows") {
bpparam = BiocParallel::SnowParam(workers = threads, progressbar = TRUE)
} else{
bpparam = BiocParallel::MulticoreParam(workers = threads, progressbar = TRUE)
}
match.result <-
BiocParallel::bplapply(
seq_len(nrow(ms1.data)),
FUN = temp.fun,
BPPARAM = bpparam,
ms1.data = ms1.data,
ms1.match.ppm = ms1.match.ppm,
rt.match.tol = rt.match.tol,
database = database,
adduct.table = adduct.table,
candidate.num = candidate.num
)
names(match.result) <- ms1.data$name
temp.idx <-
which(unlist(lapply(match.result, function(x) {
all(is.na(x))
})))
if (length(temp.idx) > 0) {
match.result <- match.result[-temp.idx]
}
if (is(database, "databaseClass")) {
return.result <- new(
Class = "metIdentifyClass",
ms1.data = ms1.data,
# ms1.info = ms1.info,
# ms2.info = ms2.info,
identification.result = match.result,
# match.result = match.result,
adduct.table = adduct.table,
# ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
# ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.match.ppm = ms1.match.ppm,
# ms2.match.ppm = ms2.match.ppm,
# ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
polarity = polarity,
# ce = paste(ce, collapse = ";"),
column = column,
# ms1.match.weight = ms1.match.weight,
# rt.match.weight = rt.match.weight,
# ms2.match.weight = ms2.match.weight,
path = path,
# total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database.name,
threads = threads,
version = "1.0.0"
)
return.result@identification.result <-
lapply(return.result@identification.result, function(x) {
if (is.null(x)) {
return(x)
} else{
x$mz.match.score <-
exp(-0.5 * (x$mz.error / (ms1.match.ppm)) ^ 2)
if (rt.match.tol > 10000) {
# cat(crayon::yellow("You set rt.match.tol > 10,000, so RT will not be used for matching.\n"))
x$RT.error <- NA
x$RT.match.score <- NA
x$Total.score <- x$mz.match.score
} else{
x$RT.match.score <-
exp(-0.5 * (x$RT.error / (rt.match.tol)) ^ 2)
x$Total.score <- x$mz.match.score * 0.5 +
x$RT.match.score * 0.5
x$Total.score[is.na(x$Total.score)] <-
x$mz.match.score[is.na(x$Total.score)]
}
x$CE <- NA
x$SS <- 0
return(x)
}
})
} else{
return.result <- new(
Class = "mzIdentifyClass",
ms1.data = ms1.data,
identification.result = match.result,
adduct.table = adduct.table,
ms1.match.ppm = ms1.match.ppm,
polarity = polarity,
column = column,
path = path,
candidate.num = candidate.num,
database = database.name,
threads = threads
)
}
message(crayon::bgRed("All done."))
return(return.result)
}
##------------------------------------------------------------------------------
#' @title get_identification_table_all
#' @description Extract the identifications from multiple results of `identify_metabolite_all()`.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ... One or multiple results from `identify_metabolite_all()`.
#' @param candidate.num candidate.num
#' @param level_condition Condition for level assign.
#' @return A identification table (data.frame).
#' @export
#' @importFrom tibble as_tibble rownames_to_column
#' @importFrom magrittr %>%
#' @importFrom stringr str_split
#' @importFrom purrr map map2
#' @importFrom crayon green red yellow
#' @importFrom dplyr select mutate everything filter left_join
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
get_identification_table_all <-
function(...,
candidate.num = 1,
level_condition = c(
"mz&rt&ms2" = 1,
"mz&rt" = 2,
"mz&ms2" = 2,
"mz" = 3
)) {
message("This function is deprecated")
options(warn = -1)
result = list(...)
##rename result
result =
purrr::map2(
.x = result,
.y = seq_along(result),
.f = function(x, y) {
names(x) = paste(y, names(x), sep = "_")
x
}
)
result = unlist(result)
database_level =
result %>%
purrr::map(function(y) {
database = y@database
mz_match = ifelse(is.na(y@identification.result[[1]]$mz.error[1]), "", "mz")
rt_match = ifelse(is.na(y@identification.result[[1]]$RT.error[1]), "", "rt")
ms2_match = ifelse(all(y@identification.result[[1]]$SS == 0), "", "ms2")
final = paste(mz_match, rt_match, ms2_match, sep = "&")
final = stringr::str_split(string = final, pattern = "&")[[1]]
final = paste(final[final != ""], collapse = "&")
level = level_condition[match(final, names(level_condition))]
c(database, level)
}) %>%
do.call(rbind, .) %>%
as.data.frame()
colnames(database_level)[seq_len(2)] = c("database", "level")
database_level =
database_level %>%
tibble::rownames_to_column(var = "name")
database_level$level[is.na(database_level$level)] = 5
####annotation table level 1
if (any(database_level$level == 1)) {
message(crayon::green("Level 1 table..."))
annotation_table1 <-
get_identification_table(result[which(database_level$level == 1)], type = "new", candidate.num = candidate.num)
annotation_table1 <-
data.frame(annotation_table1,
Level = 1,
stringsAsFactors = FALSE)
message(crayon::red("OK"))
} else{
message(crayon::yellow("No level 1."))
annotation_table1 = NULL
}
####annotation table level 2
if (any(database_level$level == 2)) {
message(crayon::green("Level 2 table..."))
annotation_table2 <-
get_identification_table(result[which(database_level$level == 2)], type = "new", candidate.num = candidate.num)
annotation_table2 <-
data.frame(annotation_table2,
Level = 2,
stringsAsFactors = FALSE)
message(crayon::red("OK"))
} else{
message(crayon::yellow("No level 2."))
annotation_table2 = NULL
}
####annotation table level 3
if (any(database_level$level == 3)) {
message(crayon::green("Level 3 table..."))
annotation_table3 <-
get_identification_table(result[which(database_level$level == 3)], type = "new", candidate.num = candidate.num)
annotation_table3 <-
data.frame(annotation_table3,
Level = 3,
stringsAsFactors = FALSE)
message(crayon::red("OK"))
} else{
message(crayon::yellow("No level 3."))
annotation_table3 = NULL
}
###remove the redundant result
if (!is.null(annotation_table1)) {
annotation_table1 =
annotation_table1 %>%
dplyr::filter(!is.na(Compound.name))
}
if (!is.null(annotation_table2)) {
annotation_table2 =
annotation_table2 %>%
dplyr::filter(!is.na(Compound.name))
if (!is.null(annotation_table1)) {
annotation_table2 =
annotation_table2 %>%
dplyr::filter(!name %in% annotation_table1$name)
}
}
if (!is.null(annotation_table3)) {
annotation_table3 =
annotation_table3 %>%
dplyr::filter(!is.na(Compound.name))
if (!is.null(annotation_table1)) {
annotation_table3 =
annotation_table3 %>%
dplyr::filter(!name %in% annotation_table1$name)
}
if (!is.null(annotation_table2)) {
annotation_table3 =
annotation_table3 %>%
dplyr::filter(!name %in% annotation_table2$name)
}
}
annotation_table =
rbind(annotation_table1, annotation_table2, annotation_table3)
ms1_peak = result[[1]]@ms1.data
diff_name =
setdiff(ms1_peak$name, annotation_table$name)
if (length(diff_name) > 0) {
new_matrix =
ms1_peak[match(diff_name, ms1_peak$name), ]
new_matrix =
new_matrix %>%
dplyr::mutate(
MS2.spectra.name = NA,
Candidate.number = NA,
Compound.name = NA,
CAS.ID = NA,
HMDB.ID = NA,
KEGG.ID = NA,
Lab.ID = NA,
Adduct = NA,
mz.error = NA,
mz.match.score = NA,
RT.error = NA,
RT.match.score = NA,
CE = NA,
SS = NA,
Total.score = NA,
Database = NA,
Level = NA
)
annotation_table =
rbind(annotation_table, new_matrix) %>%
as.data.frame()
}
ms1_peak_name = colnames(ms1_peak)
ms1_peak_name = ms1_peak_name[ms1_peak_name != "name"]
annotation_table =
annotation_table %>%
dplyr::select(-ms1_peak_name) %>%
dplyr::left_join(ms1_peak, by = "name") %>%
dplyr::select(name, ms1_peak_name, dplyr::everything())
if (candidate.num == 1) {
annotation_table =
annotation_table[match(result[[1]]@ms1.data$name, annotation_table$name), ]
}
message(crayon::red("All done."))
return(tibble::as_tibble(annotation_table))
}
##------------------------------------------------------------------------------
#' @title Get identification table from a metIdentifyClass object
#' @description Get identification table from a metIdentifyClass object.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ... One or multiple metIdentifyClass objects.
#' @param candidate.num The number of candidates.
#' @param type The type of identification table.
#' @return A identification table (data.frame).
#' @export
#' @importFrom tibble as_tibble
#' @importFrom magrittr %>%
#' @importFrom dplyr select mutate everything lag filter
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
get_identification_table <-
function(...,
candidate.num = 3,
type = c("old", "new")) {
message("This function is deprecated")
options(warn = -1)
candidate.num <- round(candidate.num)
if (candidate.num <= 0) {
candidate.num <- 1
}
object <- list(...)
###if object is a one list or not
if (length(object) == 1 & is.list(object[[1]])) {
object = object[[1]]
}
if (any(unique(unlist(lapply(object, class))) != "metIdentifyClass")) {
stop("Only for metIdentifyClass\n")
}
type <- match.arg(type)
##if object number is 1
if (length(object) == 1) {
object <- object[[1]]
database <- object@database
identification.result <- object@identification.result
if (is.null(identification.result[[1]])) {
return(NULL)
}
if (nrow(object@match.result) == 0) {
message(crayon::yellow("The object is identified without MS2 spectra."))
return(
getIdentificationTable2(
object = object,
candidate.num = candidate.num,
type = type,
silence.deprecated = TRUE
)
)
}
##add database information
identification.result <-
lapply(identification.result, function(x) {
if (nrow(x) > candidate.num) {
x <- x[seq_len(candidate.num), , drop = FALSE]
}
data.frame(x,
"Database" = object@database,
stringsAsFactors = FALSE)
})
peak.table <- object@ms1.data
match.result <- object@match.result
if (type == "old") {
identification.table <-
as.data.frame(matrix(nrow = nrow(peak.table), ncol = 3))
colnames(identification.table) <-
c("MS2.spectra.name",
"Candidate.number",
"Identification")
identification.table[match.result[, 1], 1] <-
object@ms1.info$name[match.result[, 2]]
item <- colnames(identification.result[[1]])
identification.result <-
lapply(identification.result, function(x) {
paste(apply(x, 1, function(y) {
paste(paste(item, as.character(y), sep = ":"), collapse = ";")
}), collapse = "{}")
})
identification.table$Identification[match(names(identification.result),
identification.table$MS2.spectra.name)] <-
unlist(identification.result)
identification.table$Candidate.number <-
lapply(identification.table$Identification, function(x) {
if (is.na(x))
return(0)
return(length(stringr::str_split(
string = x, pattern = "\\{\\}"
)[[1]]))
}) %>%
unlist()
identification.table <-
data.frame(peak.table, identification.table, stringsAsFactors = FALSE)
} else {
identification.table <-
vector(mode = "list", length = nrow(peak.table))
names(identification.table)[match.result[, 1]] <-
object@ms1.info$name[match.result[, 2]]
identification.table[match(names(identification.result),
names(identification.table))] <-
identification.result
peak.table <- apply(peak.table, 1, list)
peak.table <- lapply(peak.table, unlist)
identification.table <-
mapply(
FUN = function(x, y) {
if (all(is.na(y))) {
temp <-
as.data.frame(matrix(c(x, rep(NA, 14)), nrow = 1), stringsAsFactors = FALSE)
colnames(temp) <-
c(
names(x),
c(
"Compound.name",
"CAS.ID",
"HMDB.ID",
"KEGG.ID",
"Lab.ID",
"Adduct",
"mz.error",
"mz.match.score",
"RT.error",
"RT.match.score",
"CE",
"SS",
"Total.score",
"Database"
)
)
list(temp)
} else{
temp <-
as.data.frame(matrix(rep(x, nrow(y)), nrow = nrow(y), byrow = TRUE),
stringsAsFactors = FALSE)
if (nrow(temp) > 1) {
temp[2:nrow(temp), 2:ncol(temp)] <- ""
}
colnames(temp) <- names(x)
temp <-
data.frame(temp, y, stringsAsFactors = FALSE)
list(temp)
}
},
x = peak.table,
y = identification.table
)
identification.table <-
as.data.frame(do.call(rbind, identification.table))
identification.table$mz <-
identification.table$mz %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$rt <-
identification.table$rt %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$mz.error <-
identification.table$mz.error %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$mz.match.score <-
identification.table$mz.match.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$RT.error <-
identification.table$RT.error %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$RT.match.score <-
identification.table$RT.match.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$SS <-
identification.table$SS %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$Total.score <-
identification.table$Total.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
###add Candidate.number
Candidate.number = purrr::map2(
.x = object@identification.result,
.y = names(object@identification.result),
.f = function(x, y) {
data.frame(
MS2.spectra.name = y,
Candidate.number = nrow(x),
stringsAsFactors = FALSE
)
}
) %>%
do.call(rbind, .) %>%
as.data.frame()
Candidate.number =
Candidate.number %>%
dplyr::left_join(object@match.result[, c("MS1.peak.name", "MS2.spectra.name")], by = "MS2.spectra.name") %>%
dplyr::select(MS1.peak.name, dplyr::everything())
identification.table =
identification.table %>%
dplyr::left_join(Candidate.number, by = c("name" = "MS1.peak.name")) %>%
dplyr::select(
colnames(object@ms1.data),
MS2.spectra.name,
Candidate.number,
dplyr::everything()
)
identification.table$MS2.spectra.name[which(is.na(identification.table$Compound.name))] <- NA
identification.table$Candidate.number[which(is.na(identification.table$Compound.name))] <- NA
}
rownames(identification.table) <- NULL
return(tibble::as_tibble(identification.table))
} else{
len <- lapply(object, function(x) {
nrow(x@match.result)
}) %>% unlist()
if (any(len == 0)) {
warning(
crayon::yellow(
"Some objects are identified without MS2 spectra.\nPlease make sure that you want to compare them together.\n"
)
)
}
###if object number is not 1
peak.table <- object[[1]]@ms1.data
database.name <-
unlist(lapply(object, function(x)
x@database))
identification.result <-
lapply(object, function(x) {
if (nrow(x@match.result) != 0) {
iden.result <- x@identification.result
iden.result <- mapply(function(x, y) {
if (nrow(x) > candidate.num) {
x <- x[seq_len(candidate.num), , drop = FALSE]
}
x <- data.frame(x,
"MS2.spectra.name" = y,
stringsAsFactors = FALSE)
list(x)
}, x = iden.result, y = names(iden.result))
match.result <- x@match.result
# names(iden.result) <-
# match.result$MS1.peak.name[match(names(iden.result), match.result$MS2.spectra.name)]
iden.result <-
do.call(rbind, iden.result)
Peak.name <-
match.result$MS1.peak.name[match(iden.result$MS2.spectra.name,
match.result$MS2.spectra.name)]
iden.result <-
data.frame(Peak.name, iden.result, stringsAsFactors = FALSE)
rownames(iden.result) <- NULL
return(iden.result)
} else{
iden.result <- x@identification.result
iden.result <- mapply(function(x, y) {
if (nrow(x) > candidate.num) {
x <- x[seq_len(candidate.num), , drop = FALSE]
}
x <- data.frame(x,
"MS2.spectra.name" = y,
stringsAsFactors = FALSE)
list(x)
}, x = iden.result, y = names(iden.result))
}
iden.result <-
do.call(rbind, iden.result)
iden.result <-
iden.result %>%
dplyr::mutate(Peak.name = MS2.spectra.name, MS2.spectra.name = NA) %>%
dplyr::select(Peak.name, dplyr::everything())
})
identification.result <- mapply(
function(x, y, z) {
list(data.frame(
x,
"Database" = y,
"Dataset.index" = z,
stringsAsFactors = FALSE
))
},
x = identification.result,
y = database.name,
z = seq_along(identification.result)
)
identification.result <-
do.call(rbind, identification.result)
identification.result <-
as.data.frame(identification.result)
identification.result <-
plyr::dlply(
.data = identification.result,
.variables = plyr::.(Peak.name),
.fun = function(x) {
x <- x[order(x$SS, decreasing = TRUE), , drop = FALSE]
if (nrow(x) > candidate.num) {
x <- x[seq_len(candidate.num), , drop = FALSE]
}
x
}
)
# if(type == "old"){
identification.table <-
as.data.frame(matrix(nrow = nrow(peak.table), ncol = 3))
colnames(identification.table) <-
c("MS2.spectra.name", "Candidate.number", "Identification")
# identification.table[match.result[,1],1] <- object@ms1.info$name[match.result[,2]]
item <- colnames(identification.result[[1]])[-1]
identification.result <-
lapply(identification.result, function(x) {
paste(apply(x[, -1], 1, function(y) {
paste(paste(item, as.character(y), sep = ":"), collapse = ";")
}), collapse = "{}")
})
identification.table$Identification[match(names(identification.result), peak.table$name)] <-
unlist(identification.result)
identification.table$Candidate.number <-
lapply(identification.table$Identification, function(x) {
if (is.na(x))
return(0)
return(length(stringr::str_split(
string = x, pattern = "\\{\\}"
)[[1]]))
}) %>%
unlist()
identification.table$MS2.spectra.name <-
unlist(lapply(identification.table$Identification, function(x) {
if (is.na(x))
return(NA)
temp.name <-
stringr::str_extract_all(string = x, pattern = "MS2.spectra.name:mz[0-9\\.\\-\\_]{1,30}rt[0-9\\.\\-\\_]{1,30}")[[1]]
temp.name <-
stringr::str_replace_all(string = temp.name,
pattern = "MS2.spectra.name:",
replacement = "")
paste(temp.name, collapse = ";")
}))
identification.table <-
data.frame(peak.table, identification.table, stringsAsFactors = FALSE)
if (type == "new") {
identification.table <-
trans_to_new_style(identification.table = identification.table)
identification.table$mz <-
identification.table$mz %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$rt <-
identification.table$rt %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$mz.error <-
identification.table$mz.error %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$mz.match.score <-
identification.table$mz.match.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$RT.error <-
identification.table$RT.error %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$RT.match.score <-
identification.table$RT.match.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$SS <-
identification.table$SS %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
identification.table$Total.score <-
identification.table$Total.score %>%
stringr::str_trim(side = 'both') %>%
as.numeric()
}
return(tibble::as_tibble(identification.table))
}
}
##------------------------------------------------------------------------------
#' @title Transform old style identification table to new style
#' @description Transform old style identification table to new style.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param identification.table Identification table from get_identification_table.
#' @return A identification table (data.frame).
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
trans_to_new_style <-
function(identification.table) {
message("This function is deprecated")
if (all(colnames(identification.table) != "Identification")) {
return(identification.table)
} else{
identification <- identification.table$Identification
identification <-
pbapply::pblapply(identification, function(x) {
if (is.na(x)) {
return(
data.frame(
"Compound.name" = NA,
"CAS.ID" = NA,
"HMDB.ID" = NA,
"KEGG.ID" = NA,
"Lab.ID" = NA,
"Adduct" = NA,
"mz.error" = NA,
"mz.match.score" = NA,
"RT.error" = NA,
"RT.match.score" = NA,
"CE" = NA,
"SS" = NA,
"Total.score" = NA,
"Database" = NA,
stringsAsFactors = FALSE
)
)
} else{
x <- stringr::str_split(string = x, pattern = "\\{\\}")[[1]][1]
Compound.name <-
stringr::str_split(string = x,
pattern = "\\;CAS\\.ID",
n = 2)[[1]][1]
# require(magrittr)
Compound.name <- Compound.name %>%
stringr::str_replace(string = .,
pattern = "Compound.name\\:",
replacement = "")
x <-
stringr::str_split(string = x, pattern = ";")[[1]]
x <-
lapply(c(
"CAS.ID\\:",
"HMDB.ID\\:",
"KEGG.ID\\:",
"Lab.ID\\:",
"Adduct\\:",
"mz.error\\:",
"mz.match.score\\:",
"RT.error\\:",
"RT.match.score\\:",
"CE\\:",
"SS\\:",
"Total.score\\:",
"Database\\:"
), function(y) {
y <- grep(y, x, value = TRUE) %>%
stringr::str_replace(
string = .,
pattern = paste(y, "\\:", sep = ""),
replacement = ""
)
if (length(y) == 0) {
return(NA)
}
y
}) %>%
unlist()
x <- stringr::str_replace(
string = x,
pattern = c(
"CAS.ID\\:",
"HMDB.ID\\:",
"KEGG.ID\\:",
"Lab.ID\\:",
"Adduct\\:",
"mz.error\\:",
"mz.match.score\\:",
"RT.error\\:",
"RT.match.score\\:",
"CE\\:",
"SS\\:",
"Total.score\\:",
"Database\\:"
),
replacement = ""
)
x <-
matrix(c(Compound.name, x),
nrow = 1,
byrow = TRUE)
colnames(x) <-
c(
"Compound.name",
"CAS.ID",
"HMDB.ID",
"KEGG.ID",
"Lab.ID",
"Adduct",
"mz.error",
"mz.match.score",
"RT.error",
"RT.match.score",
"CE",
"SS",
"Total.score",
"Database"
)
return(as.data.frame(x, stringsAsFactors = FALSE))
}
})
identification <- do.call(rbind, identification)
identification.table <-
tibble::as_tibble(identification.table)
identification.table <-
dplyr::select(.data = identification.table, -(Identification))
identification.table <-
cbind(identification.table, identification)
invisible(tibble::as_tibble(identification.table))
# identification1 <- dplyr::bind_rows(identification)
}
}
##------------------------------------------------------------------------------
#' @title Get identification table from a mzIdentifyClass object
#' @description Get identification table from a mzIdentifyClass object.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param object mzIdentifyClass object.
#' @param candidate.num The number of candidates.
#' @param type The type of identification table.
#' @param silence.deprecated Silenc the deprecated information or not.
#' @return A identification table (data.frame).
#' @export
#' @importFrom magrittr %>%
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
# object = result_rp_pos25[[8]]
getIdentificationTable2 <-
function(object,
candidate.num = 3,
type = c("old", "new"),
silence.deprecated = FALSE) {
if (!silence.deprecated) {
message("`getIdentificationTable2()` is deprecated, use `get_identification_table()`.")
}
if (!is(object, "mzIdentifyClass") &
!is(object, "metIdentifyClass")) {
stop("Only for mzIdentifyClass or metIdentifyClass\n")
}
type <- match.arg(type)
database <- object@database
identification.result <- object@identification.result
if (is.null(identification.result[[1]])) {
return(NULL)
}
##add database information
identification.result <-
lapply(identification.result, function(x) {
if (nrow(x) > candidate.num) {
x <- x[seq_len(candidate.num), , drop = FALSE]
}
data.frame(x,
"Database" = object@database,
stringsAsFactors = FALSE)
})
peak.table <- object@ms1.data
if (type == "old") {
identification.table <-
as.data.frame(matrix(nrow = nrow(peak.table), ncol = 2))
colnames(identification.table) <-
c("Candidate.number", "Identification")
item <- colnames(identification.result[[1]])
identification.result <-
lapply(identification.result, function(x) {
paste(apply(x, 1, function(y) {
paste(paste(item, as.character(y), sep = ":"), collapse = ";")
}), collapse = "{}")
})
identification.table$Identification[match(names(identification.result), peak.table$name)] <-
unlist(identification.result)
identification.table$Candidate.number <-
lapply(identification.table$Identification, function(x) {
if (is.na(x))
return(0)
return(length(stringr::str_split(
string = x, pattern = "\\{\\}"
)[[1]]))
}) %>%
unlist()
identification.table <-
data.frame(peak.table, identification.table, stringsAsFactors = FALSE)
} else{
identification.table <-
vector(mode = "list", length = nrow(peak.table))
names(identification.table) <- object@ms1.data$name
identification.table[match(names(identification.result),
names(identification.table))] <-
identification.result
peak.table <- apply(peak.table, 1, list)
peak.table <- lapply(peak.table, unlist)
identification.table <-
mapply(
FUN = function(x, y) {
if (all(is.na(y)) | is.null(y)) {
temp <-
as.data.frame(matrix(c(x, rep(NA, 14)), nrow = 1), stringsAsFactors = FALSE)
colnames(temp) <-
c(
names(x),
c(
"Compound.name",
"CAS.ID",
"HMDB.ID",
"KEGG.ID",
"Lab.ID",
"Adduct",
"mz.error",
"mz.match.score",
"RT.error",
"RT.match.score",
"CE",
"SS",
"Total.score",
"Database"
)
)
list(temp)
} else{
temp <-
as.data.frame(matrix(rep(x, nrow(y)), nrow = nrow(y), byrow = TRUE),
stringsAsFactors = FALSE)
if (nrow(temp) > 1) {
temp[2:nrow(temp), 2:ncol(temp)] <- ""
}
colnames(temp) <- names(x)
temp <-
data.frame(temp, y, stringsAsFactors = FALSE)
list(temp)
}
},
x = peak.table,
y = identification.table
)
identification.table <-
as.data.frame(do.call(rbind, identification.table))
###as.numeric for several column
identification.table$mz =
identification.table$mz %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$rt =
identification.table$rt %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$mz.error =
identification.table$mz.error %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$RT.error =
identification.table$RT.error %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$mz.match.score =
identification.table$mz.match.score %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$RT.match.score =
identification.table$RT.match.score %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$Total.score =
identification.table$Total.score %>%
stringr::str_trim(side = "both") %>%
as.numeric()
identification.table$SS =
identification.table$SS %>%
stringr::str_trim(side = "both") %>%
as.numeric()
##add MS2.spectra.name and Candidate.number
identification.table$MS2.spectra.name = NA
identification.table$Candidate.number = NA
identification.table =
identification.table %>%
dplyr::select(
colnames(object@ms1.data),
MS2.spectra.name,
Candidate.number,
Compound.name,
CAS.ID,
HMDB.ID,
KEGG.ID,
Lab.ID,
Adduct,
mz.error,
mz.match.score,
RT.error,
RT.match.score,
CE,
SS,
Total.score,
Database
)
}
rownames(identification.table) <- NULL
return(tibble::as_tibble(identification.table))
}
#' @title Identify metabolites using multiple databases one time
#' @description Identify metabolites using multiple databases one time.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.data The name of ms1 peak table (csv format). Column 1 is "name", column 2 is
#' "mz" and column 3 is "rt" (second).
#' @param ms2.data MS2 data, must be mgf, msp or mzXML format. For example, ms2.data = c("test.mgf", "test2.msp").
#' @param parameter.list A list contains paramters for each processing.
#' The parameter must get using metIdentifyParam or mzIdentifyParam.
#' @param path Work directory.
#' @return A list containing mzIdentifyClass object.
#' @export
#' @importFrom magrittr %>%
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/multiple_databases.html}
identify_metabolite_all <-
function(ms1.data, ms2.data, parameter.list, path = ".") {
message("This function is deprecated")
dir.create(path = path, showWarnings = FALSE)
old.path <- path
path <- file.path(path, "Result")
dir.create(path = path, showWarnings = FALSE)
threads = parameter.list[[1]]$threads
ms1.data.name <- ms1.data
ms2.data.name <- ms2.data
intermediate_path <-
file.path(path, "intermediate_data")
dir.create(intermediate_path, showWarnings = FALSE)
file <- dir(intermediate_path)
if (all(c("ms1.info", "ms2.info") %in% file)) {
message(crayon::yellow("Use old data"))
load(file.path(intermediate_path, "ms1.info"))
load(file.path(intermediate_path, "ms2.info"))
} else{
##read MS2 data
message(crayon::green("Reading MS2 data..."))
temp.ms2.type <-
stringr::str_split(string = ms2.data.name, pattern = "\\.")[[1]]
temp.ms2.type <- temp.ms2.type[length(temp.ms2.type)]
if (temp.ms2.type %in% c("mzXML", "mzML")) {
ms2.data <-
masstools::read_mzxml(file = file.path(old.path, ms2.data.name),
threads = threads)
} else{
ms2.data <- lapply(ms2.data.name, function(temp.ms2.data) {
temp.ms2.type <- stringr::str_split(string = temp.ms2.data, pattern = "\\.")[[1]]
temp.ms2.type <-
temp.ms2.type[length(temp.ms2.type)]
if (!temp.ms2.type %in% c("mgf", "msp"))
stop("We only support mgf or msp.\n")
if (temp.ms2.type == "msp") {
temp.ms2.data <- readMSP(file = file.path(old.path, temp.ms2.data))
} else{
temp.ms2.data <-
masstools::read_mgf(file = file.path(old.path, temp.ms2.data))
}
temp.ms2.data
})
names(ms2.data) <- ms2.data.name
###prepare data for metIdentification function
message(crayon::green("Preparing MS2 data for identification..."))
ms2.data <-
mapply(
FUN = function(temp.ms2.data, temp.ms2.data.name) {
temp.ms2.data <- lapply(temp.ms2.data, function(x) {
info <- x$info
info <-
data.frame(
name = paste("mz", info[1], "rt", info[2], sep = ""),
"mz" = info[1],
"rt" = info[2],
"file" = temp.ms2.data.name,
stringsAsFactors = FALSE
)
rownames(info) <- NULL
x$info <- info
x
})
temp.ms2.data
},
temp.ms2.data = ms2.data,
temp.ms2.data.name = ms2.data.name
)
if (is.matrix(ms2.data)) {
ms2.data <- ms2.data[, 1]
} else{
ms2.data <- do.call(what = c, args = ms2.data)
}
}
ms1.info <- lapply(ms2.data, function(x) {
x[[1]]
})
ms2.info <- lapply(ms2.data, function(x) {
x[[2]]
})
ms1.info <- do.call(what = rbind, args = ms1.info)
ms1.info <- as.data.frame(ms1.info)
rownames(ms1.info) <- NULL
duplicated.name <-
unique(ms1.info$name[duplicated(ms1.info$name)])
if (length(duplicated.name) > 0) {
lapply(duplicated.name, function(x) {
ms1.info$name[which(ms1.info$name == x)] <-
paste(x, seq_len(sum(ms1.info$name == x)), sep = "_")
})
}
names(ms2.info) <- ms1.info$name
##save intermediate data
save(ms1.info,
file = file.path(intermediate_path, "ms1.info"),
compress = "xz")
save(ms2.info,
file = file.path(intermediate_path, "ms2.info"),
compress = "xz")
}
database_class =
purrr::map(parameter.list, function(x) {
class(x$database)
}) %>%
unlist()
database.name <-
unlist(lapply(parameter.list, function(x) {
if (!is(x$database, "databaseClass")) {
x$database
} else{
paste(x$database@database.info$Source,
x$database@database.info$Version,
sep = "_")
}
}))
##check databases with same names
database.name = make.unique(names = database.name, sep = "_")
##output database information to intermediate_data path
database_info =
data.frame(database.name, database_class, parameter = seq_along(database.name))
write.csv(
database_info,
file = file.path(intermediate_path, "database_info.csv"),
row.names = FALSE
)
if (!all(database.name[database_class != "databaseClass"] %in% dir(old.path))) {
stop(
"The database: ",
paste(database.name[!database.name %in% dir(old.path)], collapse = ", "),
"\n",
" you want to use are not in you directory: \n",
old.path
)
}
identification.result <-
vector(mode = "list", length = length(database.name))
names(identification.result) <- database.name
for (i in seq_along(database.name)) {
message(crayon::yellow("-------------------------------"))
message(crayon::yellow('Database ', i, ": ", database.name[i]))
message(crayon::yellow("-------------------------------"))
new.path <-
file.path(path, paste(database.name[i], "Result", sep = '_'))
dir.create(new.path, showWarnings = FALSE)
if (any(dir(new.path) == "result")) {
load(file.path(new.path, "result"))
identification.result[[i]] <- result
rm(list = "result")
next()
}
if (is(parameter.list[[i]]$database, "databaseClass")) {
temp_database =
parameter.list[[i]]$database
} else{
temp_database <-
load(file.path(old.path, parameter.list[[i]]$database))
temp_database <-
get(temp_database)
}
if (length(temp_database@spectra.data) == 0) {
rm(list = parameter.list[[i]]$database)
result <- mzIdentify(
ms1.data = ms1.data.name,
ms1.match.ppm = parameter.list[[i]]$ms1.match.ppm,
rt.match.tol = parameter.list[[i]]$rt.match.tol,
polarity = parameter.list[[i]]$polarity,
column = parameter.list[[i]]$column,
path = old.path,
candidate.num = parameter.list[[i]]$candidate.num,
database = parameter.list[[i]]$database,
threads = parameter.list[[i]]$threads,
silence.deprecated = TRUE
)
} else{
# rm(list = parameter.list[[i]]$database)
result <- identify_metabolites(
ms1.data = ms1.data.name,
ms2.data = ms2.data.name,
ms1.ms2.match.mz.tol = parameter.list[[i]]$ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = parameter.list[[i]]$ms1.ms2.match.rt.tol,
ms1.match.ppm = parameter.list[[i]]$ms1.match.ppm,
ms2.match.ppm = parameter.list[[i]]$ms2.match.ppm,
mz.ppm.thr = parameter.list[[i]]$mz.ppm.thr,
ms2.match.tol = parameter.list[[i]]$ms2.match.tol,
fraction.weight = parameter.list[[i]]$fraction.weight,
dp.forward.weight = parameter.list[[i]]$dp.forward.weight,
dp.reverse.weight = parameter.list[[i]]$dp.reverse.weight,
rt.match.tol = parameter.list[[i]]$rt.match.tol,
polarity = parameter.list[[i]]$polarity,
ce = parameter.list[[i]]$ce,
column = parameter.list[[i]]$column,
ms1.match.weight = parameter.list[[i]]$ms1.match.weight,
rt.match.weight = parameter.list[[i]]$rt.match.weight,
ms2.match.weight = parameter.list[[i]]$ms2.match.weight,
path = old.path,
total.score.tol = parameter.list[[i]]$total.score.tol,
candidate.num = parameter.list[[i]]$candidate.num,
database = parameter.list[[i]]$database,
threads = parameter.list[[i]]$threads
)
}
# unlink(x = new.path, recursive = TRUE, force = TRUE)
identification.result[[i]] <- result
save(result, file = file.path(new.path, "result"))
rm(list = "result")
}
invisible(identification.result)
}
#' @title Generate the parameter list for identify_metabolites function
#' @description Generate the parameter list for metIdentify function.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.ms2.match.mz.tol MS1 peak and MS2 spectrum matching m/z tolerance. Default is 25 pm.
#' @param ms1.ms2.match.rt.tol MS1 peak and MS2 spectrum matching RT tolerance. Default is 10 s.
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param ms2.match.ppm Fragment ion match ppm tolerance.
#' @param mz.ppm.thr Accurate mass tolerance for m/z error calculation.
#' @param ms2.match.tol MS2 match (MS2 similarity) tolerance.
#' @param fraction.weight The weight for matched fragments.
#' @param dp.forward.weight Forward dot product weight.
#' @param dp.reverse.weight Reverse dot product weight.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param ce Collision energy. Please confirm the CE values in your database. Default is "all".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param ms1.match.weight The weight of MS1 match for total score calculation.
#' @param rt.match.weight The weight of RT match for total score calculation.
#' @param ms2.match.weight The weight of MS2 match for total score calculation.
#' @param total.score.tol Total score tolerance. The total score are refering to MS-DIAL.
#' @param candidate.num The number of candidate.
#' @param database MS2 database name or MS2 database.
#' @param threads Number of threads
#' @return A metIdentifyClass object.
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/multiple_databases.html}
#' @examples
#' param1 <-
#' identify_metabolites_params(
#' ms1.match.ppm = 15,
#' rt.match.tol = 15,
#' polarity = "positive",
#' ce = "all",
#' column = "rp",
#' total.score.tol = 0.5,
#' candidate.num = 3,
#' threads = 3,
#' database = "msDatabase_rplc0.0.2"
#' )
#' param1
identify_metabolites_params <-
function(ms1.ms2.match.mz.tol = 25,
ms1.ms2.match.rt.tol = 10,
ms1.match.ppm = 25,
ms2.match.ppm = 30,
mz.ppm.thr = 400,
ms2.match.tol = 0.5,
fraction.weight = 0.3,
dp.forward.weight = 0.6,
dp.reverse.weight = 0.1,
rt.match.tol = 30,
polarity = c("positive", "negative"),
ce = "all",
column = c("hilic", "rp"),
ms1.match.weight = 0.25,
rt.match.weight = 0.25,
ms2.match.weight = 0.5,
total.score.tol = 0.5,
candidate.num = 3,
database,
threads = 3) {
message("This function is deprecated")
if (missing(database)) {
stop("The database or database name must be provided.\n")
}
polarity <- match.arg(polarity)
column <- match.arg(column)
param <-
list(
ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
mz.ppm.thr = mz.ppm.thr,
ms2.match.tol = ms2.match.tol,
fraction.weight = fraction.weight,
dp.forward.weight = dp.forward.weight,
dp.reverse.weight = dp.reverse.weight,
rt.match.tol = rt.match.tol,
polarity = polarity,
ce = ce,
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database,
threads = threads
)
list("metIdentifyParam" = param)
}
#' @title Identify metabolites based on MS1 or MS/MS database
#' @description Identify metabolites based on MS1 or MS/MS database.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.data The name of ms1 peak table (csv format). Column 1 is "name", Column 2 is
#' "mz" and column is "rt" (second).
#' @param ms2.data MS2 data, must be mgf, msp or mzXML format. For example, ms2.data = c("test.mgf", "test2.msp").
#' @param ms1.ms2.match.mz.tol MS1 peak and MS2 spectrum matching m/z tolerance. Default is 25 pm.
#' @param ms1.ms2.match.rt.tol MS1 peak and MS2 spectrum matching RT tolerance. Default is 10 s.
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param ms2.match.ppm Fragment ion match ppm tolerance.
#' @param mz.ppm.thr Accurate mass tolerance for m/z error calculation.
#' @param ms2.match.tol MS2 match (MS2 similarity) tolerance.
#' @param fraction.weight The weight for matched fragments.
#' @param dp.forward.weight Forward dot product weight.
#' @param dp.reverse.weight Reverse dot product weight.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param ce Collision energy. Please confirm the CE values in your database. Default is "all".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param ms1.match.weight The weight of MS1 match for total score calculation.
#' @param rt.match.weight The weight of RT match for total score calculation.
#' @param ms2.match.weight The weight of MS2 match for total score calculation.
#' @param path Work directory.
#' @param total.score.tol Total score tolerance. The total score are refering to MS-DIAL.
#' @param candidate.num The number of candidate.
#' @param database MS2 database name or MS database.
#' @param threads Number of threads
#' @return A metIdentifyClass object.
#' @importFrom crayon yellow green red bgRed
#' @importFrom magrittr %>%
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
identify_metabolites <-
function(ms1.data,
ms2.data = NULL,
ms1.ms2.match.mz.tol = 25,
ms1.ms2.match.rt.tol = 10,
ms1.match.ppm = 25,
ms2.match.ppm = 30,
mz.ppm.thr = 400,
ms2.match.tol = 0.5,
fraction.weight = 0.3,
dp.forward.weight = 0.6,
dp.reverse.weight = 0.1,
rt.match.tol = 30,
polarity = c("positive", "negative"),
ce = "all",
column = c("rp", "hilic"),
ms1.match.weight = 0.25,
rt.match.weight = 0.25,
ms2.match.weight = 0.5,
path = ".",
total.score.tol = 0.5,
candidate.num = 3,
database,
threads = 3) {
###Check data
if (missing(database)) {
stop("No database is provided.\n")
}
if (missing(ms1.data)) {
stop("Please provide MS1 data name.\n")
}
##parameter specification
polarity <- match.arg(polarity)
column <- match.arg(column)
##check ms1.file and ms2.file
file <- dir(path)
if (!all(ms1.data %in% file)) {
stop("MS1 data is not in the directory, please check it.\n")
}
if (!is.null(ms2.data)) {
if (!all(ms2.data %in% file)) {
stop("Some MS2 data are not in the directory, please check it.\n")
}
}
if (!is(database, "databaseClass")) {
if (!all(database %in% file)) {
stop("Database is not in this directory, please check it.\n")
}
}
if (is.null(ms2.data)) {
message(crayon::yellow(
"You don't provide MS2 data, so only use mz and/or RT for matching."
))
mzIdentify(
ms1.data = ms1.data,
rt.match.tol = rt.match.tol,
ms1.match.ppm = ms1.match.ppm,
polarity = polarity,
column = column,
path = path,
candidate.num = candidate.num,
database = database,
threads = threads,
silence.deprecated = TRUE
)
} else{
metIdentify(
ms1.data = ms1.data,
ms2.data = ms2.data,
##only msp and mgf and mz(X)ML are supported
ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
mz.ppm.thr = mz.ppm.thr,
ms2.match.tol = ms2.match.tol,
fraction.weight = fraction.weight,
dp.forward.weight = dp.forward.weight,
dp.reverse.weight = dp.reverse.weight,
rt.match.tol = rt.match.tol,
polarity = polarity,
ce = ce,
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
path = path,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database,
threads = threads,
silence.deprecated = TRUE
)
}
}
#' @title Identify metabolites based on MS/MS database.
#' @description Identify metabolites based on MS/MS database.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.data The name of ms1 peak table (csv format). Column 1 is "name", Column 2 is
#' "mz" and column is "rt" (second).
#' @param ms2.data MS2 data, must be mgf, msp or mzXML format. For example, ms2.data = c("test.mgf", "test2.msp").
#' @param ms1.ms2.match.mz.tol MS1 peak and MS2 spectrum matching m/z tolerance. Default is 25 pm.
#' @param ms1.ms2.match.rt.tol MS1 peak and MS2 spectrum matching RT tolerance. Default is 10 s.
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param ms2.match.ppm Fragment ion match ppm tolerance.
#' @param mz.ppm.thr Accurate mass tolerance for m/z error calculation.
#' @param ms2.match.tol MS2 match (MS2 similarity) tolerance.
#' @param fraction.weight The weight for matched fragments.
#' @param dp.forward.weight Forward dot product weight.
#' @param dp.reverse.weight Reverse dot product weight.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param ce Collision energy. Please confirm the CE values in your database. Default is "all".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param ms1.match.weight The weight of MS1 match for total score calculation.
#' @param rt.match.weight The weight of RT match for total score calculation.
#' @param ms2.match.weight The weight of MS2 match for total score calculation.
#' @param path Work directory.
#' @param total.score.tol Total score tolerance. The total score are refering to MS-DIAL.
#' @param candidate.num The number of candidate.
#' @param database MS2 database name or MS2 database.
#' @param threads Number of threads
#' @param silence.deprecated Silenc the deprecated information or not.
#' @return A metIdentifyClass object.
#' @importFrom crayon yellow green red bgRed
#' @export
#' @seealso The example and demo data of this function can be found
#' \url{https://tidymass.github.io/metid/articles/metid.html}
metIdentify <-
function(ms1.data,
##csv format
ms2.data = NULL,
##only msp and mgf and mz(X)ML are supported
ms1.ms2.match.mz.tol = 25,
ms1.ms2.match.rt.tol = 10,
ms1.match.ppm = 25,
ms2.match.ppm = 30,
mz.ppm.thr = 400,
ms2.match.tol = 0.5,
fraction.weight = 0.3,
dp.forward.weight = 0.6,
dp.reverse.weight = 0.1,
rt.match.tol = 30,
polarity = c("positive", "negative"),
ce = "all",
column = c("hilic", "rp"),
ms1.match.weight = 0.25,
rt.match.weight = 0.25,
ms2.match.weight = 0.5,
path = ".",
total.score.tol = 0.5,
candidate.num = 3,
database,
threads = 3,
silence.deprecated = FALSE) {
if (!silence.deprecated) {
message("`metIdentify()` is deprecated, use `identify_metabolites()`.")
}
###Check data
if (missing(database)) {
stop("No database is provided.\n")
}
if (missing(ms1.data)) {
stop("Please provide MS1 data name.\n")
}
##parameter specification
polarity <- match.arg(polarity)
column <- match.arg(column)
##check ms1.file and ms2.file
file <- dir(path)
intermediate_path <- file.path(path, "intermediate_data")
dir.create(intermediate_path, showWarnings = FALSE)
if (!all(ms1.data %in% file)) {
stop("MS1 data is not in the directory, please check it.\n")
}
if (!is.null(ms2.data)) {
if (!all(ms2.data %in% file)) {
stop("Some MS2 data are not in the directory, please check it.\n")
}
}
if (!is(database, "databaseClass")) {
if (!all(database %in% file)) {
stop("Database is not in this directory, please check it.\n")
}
}
#load MS2 database
if (!is(database, "databaseClass")) {
database.name <- database
load(file.path(path, database.name))
database <- get(database.name)
} else{
database.name = paste(database@database.info$Source,
database@database.info$Version,
sep = "_")
}
if (!is(database, "databaseClass")) {
stop("database must be databaseClass object\n")
}
ce.list.pos <-
unique(unlist(lapply(
database@spectra.data$Spectra.positive, names
)))
ce.list.neg <-
unique(unlist(lapply(
database@spectra.data$Spectra.negative, names
)))
ce.list <-
ifelse(polarity == "positive", ce.list.pos, ce.list.neg)
if (all(ce %in% ce.list) & ce != "all") {
stop("All ce values you set are not in database. Please check it.\n")
ce <- ce[ce %in% ce.list]
}
rm(list = c("ce.list.pos", "ce.list.neg", "ce.list"))
##ce values
if (all(ce != "all")) {
if (polarity == "positive") {
ce.list <-
unique(unlist(
lapply(database@spectra.data$Spectra.positive, function(x) {
names(x)
})
))
if (length(grep("Unknown", ce.list)) > 0) {
ce <-
unique(c(ce, grep(
pattern = "Unknown", ce.list, value = TRUE
)))
}
} else{
ce.list <-
unique(unlist(
lapply(database@spectra.data$Spectra.negative, function(x) {
names(x)
})
))
if (length(grep("Unknown", ce.list)) > 0) {
ce <-
unique(c(ce, grep(
pattern = "Unknown", ce.list, value = TRUE
)))
}
}
}
##RT in database or not
if (!database@database.info$RT) {
message(crayon::yellow(
"No RT information in database.\nThe weight of RT have been set as 0."
))
}
#------------------------------------------------------------------
##load adduct table
if (polarity == "positive" & column == "hilic") {
data("hilic.pos", envir = environment())
adduct.table <- hilic.pos
}
if (polarity == "positive" & column == "rp") {
data("rp.pos", envir = environment())
adduct.table <- rp.pos
}
if (polarity == "negative" & column == "hilic") {
data("hilic.neg", envir = environment())
adduct.table <- hilic.neg
}
if (polarity == "negative" & column == "rp") {
data("rp.neg", envir = environment())
adduct.table <- rp.neg
}
if (all(c("ms1.info", "ms2.info") %in% dir(intermediate_path))) {
message(crayon::yellow("Use old data."))
load(file.path(intermediate_path, "ms1.info"))
load(file.path(intermediate_path, "ms2.info"))
} else{
##read MS2 data
# cat(crayon::green("Reading MS2 data...\n"))
ms2.data.name <- ms2.data
temp.ms2.type <-
stringr::str_split(string = ms2.data.name, pattern = "\\.")[[1]]
temp.ms2.type <- temp.ms2.type[length(temp.ms2.type)]
if (temp.ms2.type %in% c("mzXML", "mzML")) {
ms2.data <-
masstools::read_mzxml(file = file.path(path, ms2.data.name),
threads = threads)
} else{
ms2.data <- lapply(ms2.data.name, function(temp.ms2.data) {
temp.ms2.type <- stringr::str_split(string = temp.ms2.data, pattern = "\\.")[[1]]
temp.ms2.type <-
temp.ms2.type[length(temp.ms2.type)]
if (!temp.ms2.type %in% c("mgf", "msp"))
stop("We only support mgf or msp.\n")
if (temp.ms2.type == "msp") {
temp.ms2.data <- readMSP(file = file.path(path, temp.ms2.data))
} else{
temp.ms2.data <-
masstools::read_mgf(file = file.path(path, temp.ms2.data))
}
temp.ms2.data
})
names(ms2.data) <- ms2.data.name
###prepare data for metidentification function
message(crayon::green("Preparing MS2 data for identification..."))
ms2.data <-
mapply(
FUN = function(temp.ms2.data, temp.ms2.data.name) {
temp.ms2.data <- lapply(temp.ms2.data, function(x) {
info <- x$info
info <-
data.frame(
name = paste("mz", info[1], "rt", info[2], sep = ""),
"mz" = info[1],
"rt" = info[2],
"file" = temp.ms2.data.name,
stringsAsFactors = FALSE
)
rownames(info) <- NULL
x$info <- info
x
})
temp.ms2.data
},
temp.ms2.data = ms2.data,
temp.ms2.data.name = ms2.data.name
)
if (is(ms2.data, "matrix")) {
ms2.data <- ms2.data[, 1]
} else{
ms2.data <- do.call(what = c, args = ms2.data)
}
}
ms1.info <- lapply(ms2.data, function(x) {
x[[1]]
})
ms2.info <- lapply(ms2.data, function(x) {
x[[2]]
})
ms1.info <- do.call(what = rbind, args = ms1.info)
ms1.info <- as.data.frame(ms1.info)
rownames(ms1.info) <- NULL
duplicated.name <-
unique(ms1.info$name[duplicated(ms1.info$name)])
if (length(duplicated.name) > 0) {
lapply(duplicated.name, function(x) {
ms1.info$name[which(ms1.info$name == x)] <-
paste(x, seq_len(sum(ms1.info$name == x)), sep = "_")
})
}
names(ms2.info) <- ms1.info$name
##save intermediate data
save(ms1.info,
file = file.path(intermediate_path, "ms1.info"),
compress = "xz")
save(ms2.info,
file = file.path(intermediate_path, "ms2.info"),
compress = "xz")
message(crayon::red("OK."))
}
if (!missing(ms1.data)) {
message(crayon::green("Matching peak table with MS2 spectrum..."))
##check ms1 data format
if (length(grep("csv", ms1.data)) == 0) {
stop("Only support csv format ms1 data.\n")
}
ms1.data <-
readr::read_csv(file = file.path(path, ms1.data),
col_types = readr::cols())
##check for the ms1 data
if (ncol(ms1.data) < 3) {
stop(
"MS1 data should have there columns.
See here: \n https://tidymass.github.io/metid/articles/metabolite_annotation_using_MS1.html"
)
}
if (colnames(ms1.data)[1] != "name" |
colnames(ms1.data)[2] != "mz" |
colnames(ms1.data)[3] != "rt") {
stop("The columns should be name, mz and rt, respectively.\n")
}
colnames(ms1.data)[seq_len(3)] <- c("name", "mz", "rt")
match.result <-
masstools::mz_rt_match(
data1 = ms1.data[, c(2, 3)],
data2 = ms1.info[, c(2, 3)],
mz.tol = ms1.ms2.match.mz.tol,
rt.tol = ms1.ms2.match.rt.tol,
rt.error.type = "abs"
)
if (is.null(match.result))
return("No peaks are matched with MS2 spectra.\n")
if (nrow(match.result) == 0)
return("No peaks are matched with MS2 spectra.\n")
message(crayon::green(
length(unique(match.result[, 1])),
"out of",
nrow(ms1.data),
"peaks have MS2 spectra."
))
###if one peak matches multiple peaks, select the more reliable MS2 spectrum
message(crayon::green("Selecting the most intense MS2 spectrum for each peak..."))
temp.idx <- unique(match.result[, 1])
match.result <- lapply(temp.idx, function(idx) {
idx2 <- match.result[which(match.result[, 1] == idx), 2]
if (length(idx2) == 1) {
return(c(idx, idx2))
} else{
temp.ms2.info <- ms2.info[idx2]
return(c(idx, idx2[which.max(unlist(lapply(temp.ms2.info, function(y) {
y <- y[order(y[, 2], decreasing = TRUE), , drop = FALSE]
if (nrow(y) > 5)
y <- y[seq_len(5), ]
sum(y[, 2])
})))]))
}
})
match.result <- do.call(rbind, match.result)
match.result <- as.data.frame(match.result)
colnames(match.result) <- c("Index1", "Index2")
match.result <- data.frame(match.result,
ms1.data$name[match.result$Index1],
ms1.info$name[match.result$Index2],
stringsAsFactors = FALSE)
colnames(match.result) <-
c("Index1.ms1.data",
"Index.ms2.spectra",
"MS1.peak.name",
"MS2.spectra.name")
ms1.info <-
ms1.info[unique(match.result[, 2]), , drop = FALSE]
ms2.info <- ms2.info[unique(match.result[, 2])]
match.result$Index.ms2.spectra <-
match(match.result$MS2.spectra.name, ms1.info$name)
save(
match.result,
file = file.path(intermediate_path, "match.result"),
compress = "xz"
)
message(crayon::red("OK."))
} else{
stop("Please provide MS1 data name.\n")
}
ms2Matchresult <-
metIdentification(
ms1.info = ms1.info,
ms2.info = ms2.info,
polarity = polarity,
ce = ce,
database = database,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
mz.ppm.thr = mz.ppm.thr,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
adduct.table = adduct.table,
threads = threads,
fraction.weight = fraction.weight,
dp.forward.weight = dp.forward.weight,
dp.reverse.weight = dp.reverse.weight
)
return.result <- new(
Class = "metIdentifyClass",
ms1.data = ms1.data,
ms1.info = ms1.info,
ms2.info = ms2.info,
identification.result = ms2Matchresult,
match.result = match.result,
adduct.table = adduct.table,
ms1.ms2.match.mz.tol = ms1.ms2.match.mz.tol,
ms1.ms2.match.rt.tol = ms1.ms2.match.rt.tol,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
polarity = polarity,
ce = paste(ce, collapse = ";"),
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
path = path,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database.name,
threads = threads,
version = "1.0.0"
)
message(crayon::bgRed("All done."))
return(return.result)
}
#' ---------------------------------------------------------------------------
#' @title readMSP_MoNA
#' @description Read MSP data from MoNA.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param file The vector of names of ms2 files. MS2 file must be msp. The msp data must from MoNA.
#' @return Return ms2 data. This is a list.
#' @export
readMSP_MoNA <-
function(file) {
message("`readMSP_MoNA()` is deprecated, use `read_msp_mona()`.")
message(crayon::green("Reading MSP data..."))
msp.data <- readr::read_lines(file)
n.null <- which(msp.data == '')
temp.idx1 <- c(1, n.null[-length(n.null)])
temp.idx2 <- n.null - 1
temp.idx <- data.frame(temp.idx1, temp.idx2, stringsAsFactors = FALSE)
rm(list = c("temp.idx1", "temp.idx2"))
gc(verbose = FALSE)
temp.idx <- apply(temp.idx, 1, list)
temp.idx <- lapply(temp.idx, unlist)
temp.idx <- temp.idx[which(unlist(lapply(temp.idx, function(x) {
x[1] != x[2]
})))]
pbapply::pboptions(style = 1)
# fix bug
message(crayon::yellow("Transforming..."))
info.spec <- pbapply::pblapply(temp.idx, function(idx) {
if (idx[1] == idx[2])
return(NULL)
temp.msp.data <- msp.data[idx[1]:idx[2]]
temp.msp.data <- temp.msp.data[temp.msp.data != ""]
info.idx <- grep("[A-Za-z]", temp.msp.data)
temp.info <- temp.msp.data[info.idx]
temp.info <-
stringr::str_split(string = temp.info,
pattern = ":",
n = 2)
temp.info <- do.call(rbind, temp.info)
temp.info <- data.frame(temp.info, stringsAsFactors = FALSE)
temp.info[, 2] <-
stringr::str_trim(temp.info[, 2], side = "both")
temp.info[, 1] <-
stringr::str_trim(temp.info[, 1], side = "both")
colnames(temp.info) <- rownames(temp.info) <- NULL
#combine synons
if (length(grep("Synon", temp.info[, 1])) > 1) {
Synon <-
stringr::str_c(temp.info[stringr::str_which(string = temp.info[, 1], pattern = "Synon"), 2, drop = TRUE], collapse = ";")
temp.info[which(temp.info[, 1] == "Synon")[1], 2] <- Synon
temp.info <- temp.info[!duplicated(temp.info[, 1]), ]
}
rownames(temp.info) <- temp.info[, 1]
temp.info <- temp.info[, -1, drop = FALSE]
temp.spec <- temp.msp.data[-info.idx]
if (length(temp.spec) != 0) {
if (length(grep(" ", temp.spec[1])) == 1) {
temp.spec <- strsplit(temp.spec, split = ' ')
}
if (length(grep("\t", temp.spec[1])) == 1) {
temp.spec <- strsplit(x = temp.spec, split = "\t")
}
temp.spec <- do.call(rbind, temp.spec)
temp.spec <- data.frame(temp.spec, stringsAsFactors = FALSE)
colnames(temp.spec) <- c('mz', 'intensity')
rownames(temp.spec) <- NULL
temp.spec$mz <- as.numeric(as.character(temp.spec$mz))
temp.spec$intensity <- as.numeric(temp.spec$intensity)
temp.spec <- temp.spec[temp.spec$intensity != 0, ]
} else{
temp.spec <- NULL
}
list('info' = temp.info, 'spec' = temp.spec)
})
rm(list = c("msp.data", "temp.idx"))
gc()
##remove NULL
info.spec <- info.spec[!unlist(lapply(info.spec, is.null))]
remove.idx <-
which(unlist(lapply(info.spec, function(x)
is.null(x[[2]]))))
if (length(remove.idx) > 0) {
info.spec <- info.spec[-remove.idx]
}
rm(list = c("remove.idx"))
gc()
info.spec <- info.spec
}
#######------------------------------------------------------------------------
#' @title readMGF
#' @description Read MGF data.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param file The vector of names of ms2 files. MS2 file must be mgf.
#' @return Return ms2 data. This is a list.
#' @export
readMGF <-
function(file) {
message("`readMGF()` is deprecated, use `read_mgf()`.")
pbapply::pboptions(style = 1)
message(crayon::green("Reading MS2 data..."))
# mgf.data.list <- pbapply::pblapply(file, list_mgf)
ms2 <- pbapply::pblapply(file, function(mgf.data) {
mgf.data <- list_mgf(mgf.data)
# nl.spec <- grep('^\\d', mgf.data)
nl.spec <-
lapply(mgf.data, function(x)
grep('^\\d', x))
info.mz <-
lapply(mgf.data, function(x)
grep('^PEPMASS', x, value = TRUE))
info.rt <-
lapply(mgf.data, function(x)
grep('^RTINSECONDS', x, value = TRUE))
info.mz <- unlist(info.mz)
#for orbitrap data, the intensity of precursor ion should be removed
info.mz <-
unlist(lapply(strsplit(x = info.mz, split = " "), function(x)
x[1]))
info.mz <-
as.numeric(gsub(pattern = "\\w+=", "", info.mz))
info.rt <- unlist(info.rt)
info.rt <-
as.numeric(gsub(pattern = "\\w+=", "", info.rt))
if (length(mgf.data) == 1) {
spec <- mapply(function(x, y) {
temp <- do.call(rbind, strsplit(x[y], split = " "))
list(temp)
}, x = mgf.data, y = nl.spec)
} else{
spec <- mapply(function(x, y) {
do.call(rbind, strsplit(x[y], split = " "))
}, x = mgf.data, y = nl.spec)
}
spec <- lapply(spec, function(x) {
temp <- cbind(as.numeric(x[, 1]), as.numeric(x[, 2]))
temp <- matrix(temp, ncol = 2)
# if(nrow(temp) > 0) temp <- temp[temp[,2] >= max(temp[,2])*0.01,]
temp <- matrix(temp, ncol = 2)
colnames(temp) <- c("mz", "intensity")
temp
})
ms2 <- mapply(function(x, y, z) {
info <- c(y, z)
names(info) <- c("mz", "rt")
spectrum <- as.matrix(x)
temp <- list(info, spectrum)
names(temp) <- c("info", "spec")
list(temp)
},
x = spec,
y = info.mz,
z = info.rt)
ms2
})
spec.info <- ms2[[1]]
if (length(ms2) > 1) {
for (i in 2:length(ms2)) {
spec.info <- c(spec.info, ms2[[i]])
}
}
remove.idx <-
which(unlist(lapply(spec.info, function(x)
nrow(x[[2]]))) == 0)
if (length(remove.idx) != 0)
spec.info <- spec.info[-remove.idx]
# ##remove noise
# cat("\n")
# cat("Remove noise of MS/MS spectra...\n")
# spec.info <- pbapply::pblapply(spec.info, function(x){
# temp.spec <- x[[2]]
# temp.spec <- remove_noise(temp.spec)
# x[[2]] <- temp.spec
# x
# })
spec.info <- spec.info
}
#---------------------------------------------------------------------------
#' @title readMSP
#' @description Read MSP data.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param file The vector of names of ms2 files. MS2 file must be msp format.
#' @param silence.deprecated Silenc the deprecated information or not.
#' @return Return ms2 data. This is a list.
#' @export
readMSP <-
function(file, silence.deprecated = TRUE) {
if (!silence.deprecated) {
message("`readMSP()` is deprecated, use `read_msp()`.")
}
msp.data <- readLines(file)
if (length(grep("BEGIN IONS", msp.data)) > 0) {
msp.data <- msp.data[msp.data != ""]
temp.idx1 <- grep("BEGIN IONS", msp.data)
temp.idx2 <- grep("END IONS", msp.data)
if (length(temp.idx2) < length(temp.idx1)) {
temp.idx2 <- c(temp.idx2, length(msp.data))
}
temp.idx <-
purrr::map2(.x = temp.idx1, temp.idx2, function(x, y) {
c(x + 1, y - 1)
})
ms2_spec <- purrr::map(
.x = temp.idx,
.f = function(x) {
temp_spec <- msp.data[x[1]:x[2]]
temp_spec <- temp_spec
spec_info <-
temp_spec[stringr::str_detect(temp_spec, "[A-Za-z]")]
spec <-
temp_spec[!stringr::str_detect(temp_spec, "[A-Za-z]")]
spec_info <- stringr::str_split(spec_info, "\\=") %>%
do.call(rbind, .)
mz <-
as.numeric(spec_info[grep("MASS|MZ", spec_info[, 1]), 2])
rt <-
as.numeric(spec_info[grep("RT|RETETION", spec_info[, 1]), 2])
spec_info <- c(mz = mz, rt = rt)
spec <- purrr::map(
.x = spec,
.f = function(x) {
stringr::str_split(x, " ")[[1]] %>% as.numeric()
}
) %>%
do.call(rbind, .)
spec <-
spec %>% as.matrix()
colnames(spec) <- c("mz", "intensity")
spec <- list(info = spec_info, spec = spec)
spec
}
)
return(ms2_spec)
} else{
# n.tot <- length(msp.data)
n.null <- which(msp.data == '')
temp.idx1 <- c(1, n.null[-length(n.null)])
temp.idx2 <- n.null - 1
temp.idx <- data.frame(temp.idx1, temp.idx2, stringsAsFactors = FALSE)
temp.idx <- apply(temp.idx, 1, list)
temp.idx <- lapply(temp.idx, unlist)
# n.spec <- which(grepl('^\\d', msp.data))
# n.info <- seq(n.tot)[-c(n.spec, n.null)]
pbapply::pboptions(style = 1)
info.spec <- pbapply::pblapply(temp.idx, function(idx) {
temp.msp.data <- msp.data[idx[1]:idx[2]]
temp.msp.data <- temp.msp.data[temp.msp.data != ""]
info.idx <- grep("[A-Za-z]", temp.msp.data)
temp.info <- temp.msp.data[info.idx]
temp.info <- strsplit(temp.info, split = ":")
temp.info <- do.call(rbind, temp.info)
temp.info <- data.frame(temp.info, stringsAsFactors = FALSE)
temp.info[, 2] <- stringr::str_trim(temp.info[, 2])
colnames(temp.info) <- rownames(temp.info) <- NULL
rownames(temp.info) <- temp.info[, 1]
temp.info <- temp.info[, -1, drop = FALSE]
temp.spec <- temp.msp.data[-info.idx]
if (length(temp.spec) != 0) {
if (length(grep(" ", temp.spec[1])) == 1) {
temp.spec <- strsplit(temp.spec, split = ' ')
}
if (length(grep("\t", temp.spec[1])) == 1) {
temp.spec <- strsplit(x = temp.spec, split = "\t")
}
temp.spec <- do.call(rbind, temp.spec)
temp.spec <- data.frame(temp.spec, stringsAsFactors = FALSE)
colnames(temp.spec) <- c('mz', 'intensity')
rownames(temp.spec) <- NULL
temp.spec$mz <- as.numeric(as.character(temp.spec$mz))
temp.spec$intensity <- as.numeric(temp.spec$intensity)
temp.spec <- temp.spec[temp.spec$intensity != 0, ]
} else{
temp.spec <- NULL
}
list('info' = temp.info, 'spec' = temp.spec)
})
mz.idx <- grep("[Mm][Zz]", rownames(info.spec[[1]][[1]]))
rt.idx <-
grep("Time|TIME|time|RT|rt|Rt", rownames(info.spec[[1]][[1]]))
##fix bug in msp data from metAnalyzer
if (length(rt.idx) == 0) {
message(crayon::yellow("The msp data are from MetAnalyzer software."))
rt.idx <-
grep("NAME|Name|name", rownames(info.spec[[1]][[1]]))
##rt.idx is the name of peak
info.spec <- lapply(info.spec, function(x) {
info <- x[[1]]
mz <- as.numeric(info[mz.idx, 1])
rt <- as.character(info[rt.idx, 1])
info <- c(mz, rt)
names(info) <- c("mz", "rt")
x[[1]] <- info
x
})
} else{
info.spec <- lapply(info.spec, function(x) {
info <- x[[1]]
mz <- as.numeric(info[mz.idx, 1])
rt <- as.numeric(info[rt.idx, 1])
info <- c(mz, rt)
names(info) <- c("mz", "rt")
x[[1]] <- info
x
})
}
}
remove.idx <-
which(unlist(lapply(info.spec, function(x)
is.null(x[[2]]))))
if (length(remove.idx) > 0) {
info.spec <- info.spec[-remove.idx]
}
info.spec <- info.spec
}
#' @title readMZXML
#' @description Read mzXML data.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param file The vector of names of ms2 files. MS2 file must be mzXML or mzML.
#' @param threads Thread number
#' @return Return ms2 data. This is a list.
#' @export
readMZXML <-
function(file, threads = 3) {
message("`readMZXML()` is deprecated, use `read_mzxml()`.")
# pbapply::pboptions(style = 1)
message(crayon::green("Reading MS2 data..."))
# mzxml.data.list <- pbapply::pblapply(file, list_mgf)
ms2 <-
MSnbase::readMSData(files = file,
msLevel. = 2,
mode = "onDisk")
message(crayon::green("Processing..."))
new.ms2 <- ProtGenerics::spectra(object = ms2)
rm(list = c("ms2"))
#
temp.fun <- function(idx, ms2) {
temp.ms2 <- ms2[[idx]]
rm(list = c("ms2"))
info <-
data.frame(
name = paste("mz", temp.ms2@precursorMz, "rt", temp.ms2@rt, sep = ""),
"mz" = temp.ms2@precursorMz,
"rt" = temp.ms2@rt,
"file" = file[temp.ms2@fromFile],
stringsAsFactors = FALSE
)
duplicated.name <-
unique(info$name[duplicated(info$name)])
if (length(duplicated.name) > 0) {
lapply(duplicated.name, function(x) {
info$name[which(info$name == x)] <-
paste(x, seq_len(sum(info$name == x)), sep = "_")
})
}
rownames(info) <- NULL
spec <- data.frame(
"mz" = temp.ms2@mz,
"intensity" = temp.ms2@intensity,
stringsAsFactors = FALSE
)
list(info = info, spec = spec)
}
new.ms2 <-
BiocParallel::bplapply(
X = seq_along(new.ms2),
FUN = temp.fun,
BPPARAM = BiocParallel::MulticoreParam(workers = threads, progressbar = TRUE),
ms2 = new.ms2
)
new.ms2 <- new.ms2
}
#' @title Identify single peak based on database.
#' @description We can use this function to identify single peak, you can just provide m/z or rt, or you can also provide MS2 spectrum for this peak.
#' @author Xiaotao Shen
#' \email{xiaotao.shen@@outlook.com}
#' @param ms1.mz m/z value of the peaks
#' @param ms1.rt rt value of the peaks
#' @param ms2 MS2 spectra of the peaks. It must be a two column data frame, the
#' first column is m/z and the second column is the intensity.
#' @param ms1.match.ppm Precursor match ppm tolerance.
#' @param ms2.match.ppm Fragment ion match ppm tolerance.
#' @param mz.ppm.thr Accurate mass tolerance for m/z error calculation.
#' @param ms2.match.tol MS2 match (MS2 similarity) tolerance.
#' @param fraction.weight The weight for matched fragments.
#' @param dp.forward.weight Forward dot product weight.
#' @param dp.reverse.weight Reverse dot product weight.
#' @param rt.match.tol RT match tolerance.
#' @param polarity The polarity of data, "positive"or "negative".
#' @param ce Collision energy. Please confirm the CE values in your database. Default is "all".
#' @param column "hilic" (HILIC column) or "rp" (reverse phase).
#' @param ms1.match.weight The weight of MS1 match for total score calculation.
#' @param rt.match.weight The weight of RT match for total score calculation.
#' @param ms2.match.weight The weight of MS2 match for total score calculation.
#' @param path Work directory.
#' @param total.score.tol Total score tolerance. The total score are refering to MS-DIAL.
#' @param candidate.num The number of candidate.
#' @param database MS2 database name.
#' @param threads Number of threads
#' @return A metIdentifyClass object.
#' @export
identify_single_peak <-
function(ms1.mz,
ms1.rt,
ms2,
ms1.match.ppm = 25,
ms2.match.ppm = 30,
mz.ppm.thr = 400,
ms2.match.tol = 0.5,
fraction.weight = 0.3,
dp.forward.weight = 0.6,
dp.reverse.weight = 0.1,
rt.match.tol = 30,
polarity = c("positive", "negative"),
ce = "all",
column = c("hilic", "rp"),
ms1.match.weight = 0.25,
rt.match.weight = 0.25,
ms2.match.weight = 0.5,
path = ".",
total.score.tol = 0.5,
candidate.num = 3,
database,
threads = 3) {
message("This function is deprecated")
###Check data
if (missing(database)) {
stop("No database is provided.\n")
}
if (missing(ms1.mz) | missing(ms1.rt) | missing(ms2)) {
stop("ms1.mz, ms1.rt or ms2 is not provided.\n")
}
##parameter specification
polarity <- match.arg(polarity)
column <- match.arg(column)
#load MS2 database
database.name <- database
load(file.path(path, database.name))
database <- get(database.name)
if (!is(database, "databaseClass")) {
stop("database must be databaseClass object\n")
}
ce.list.pos <-
unique(unlist(lapply(
database@spectra.data$Spectra.positive, names
)))
ce.list.neg <-
unique(unlist(lapply(
database@spectra.data$Spectra.negative, names
)))
ce.list <-
ifelse(polarity == "positive", ce.list.pos, ce.list.neg)
if (all(ce %in% ce.list) & ce != "all") {
stop("All ce values you set are not in database. Please check it.\n")
ce <- ce[ce %in% ce.list]
}
rm(list = c("ce.list.pos", "ce.list.neg", "ce.list"))
##ce values
if (all(ce != "all")) {
if (polarity == "positive") {
ce.list <-
unique(unlist(
lapply(database@spectra.data$Spectra.positive, function(x) {
names(x)
})
))
if (length(grep("Unknown", ce.list)) > 0) {
ce <-
unique(c(ce, grep(
pattern = "Unknown", ce.list, value = TRUE
)))
}
} else{
ce.list <-
unique(unlist(
lapply(database@spectra.data$Spectra.negative, function(x) {
names(x)
})
))
if (length(grep("Unknown", ce.list)) > 0) {
ce <-
unique(c(ce, grep(
pattern = "Unknown", ce.list, value = TRUE
)))
}
}
}
##RT in database or not
if (!database@database.info$RT) {
message(crayon::yellow(
"No RT information in database.\nThe weight of RT have been set as 0."
))
}
#------------------------------------------------------------------
##load adduct table
if (polarity == "positive" & column == "hilic") {
data("hilic.pos", envir = environment())
adduct.table <- hilic.pos
}
if (polarity == "positive" & column == "rp") {
data("rp.pos", envir = environment())
adduct.table <- rp.pos
}
if (polarity == "negative" & column == "hilic") {
data("hilic.neg", envir = environment())
adduct.table <- hilic.neg
}
if (polarity == "negative" & column == "rp") {
data("rp.neg", envir = environment())
adduct.table <- rp.neg
}
name <- paste(paste("mz", ms1.mz, sep = ""), paste("rt", ms1.rt, sep = ""), sep = "")
file <- "test"
ms1.info <- data.frame(name,
mz = ms1.mz,
rt = ms1.rt,
file,
stringsAsFactors = FALSE)
if (is.matrix(ms2) | is.data.frame(ms2)) {
ms2.info <- list(ms2)
} else{
ms2.info <- ms2
}
names(ms2.info) <- ms1.info$name
if (c(length(ms1.mz), length(ms1.rt), length(ms2.info)) %>%
unique() %>%
length() != 1) {
stop("Length of ms1.mz, ms1.rt and ms2 must be same.\n")
}
ms2Matchresult <-
metIdentification(
ms1.info = ms1.info,
ms2.info = ms2.info,
polarity = polarity,
ce = ce,
database = database,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
mz.ppm.thr = mz.ppm.thr,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
adduct.table = adduct.table,
threads = threads,
fraction.weight = fraction.weight,
dp.forward.weight = dp.forward.weight,
dp.reverse.weight = dp.reverse.weight
)
ms1.data <- ms1.info %>%
dplyr::select(-file)
match.result <- data.frame(
Index1.ms1.data = seq_len(nrow(ms1.info)),
Index.ms2.spectra = seq_len(nrow(ms1.info)),
MS1.peak.name = ms1.info$name,
MS2.spectra.name = ms1.info$name,
stringsAsFactors = FALSE
)
return.result <- new(
Class = "metIdentifyClass",
ms1.data = ms1.data,
ms1.info = ms1.info,
ms2.info = ms2.info,
identification.result = ms2Matchresult,
match.result = match.result,
adduct.table = adduct.table,
ms1.ms2.match.mz.tol = 0,
ms1.ms2.match.rt.tol = 0,
ms1.match.ppm = ms1.match.ppm,
ms2.match.ppm = ms2.match.ppm,
ms2.match.tol = ms2.match.tol,
rt.match.tol = rt.match.tol,
polarity = polarity,
ce = paste(ce, collapse = ";"),
column = column,
ms1.match.weight = ms1.match.weight,
rt.match.weight = rt.match.weight,
ms2.match.weight = ms2.match.weight,
path = path,
total.score.tol = total.score.tol,
candidate.num = candidate.num,
database = database.name,
threads = threads,
version = "1.0.0"
)
message(crayon::bgRed("All done."))
return(return.result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.