#' @title Extract compound data from a file in SDF format
#'
#' @description
#'
#' `compound_tbl_sdf()` extracts basic compound annotations from a file in SDF
#' format (structure-data file). The function currently supports SDF files from:
#'
#' - HMDB (Human Metabolome Database): http://www.hmdb.ca
#' - ChEBI (Chemical Entities of Biological Interest): http://ebi.ac.uk/chebi
#' - LMSD (LIPID MAPS Structure Database): http://www.lipidmaps.org
#' - PubChem: https://pubchem.ncbi.nlm.nih.gov/
#' - MoNa: http://mona.fiehnlab.ucdavis.edu/ (see notes below!)
#'
#' @details
#'
#' Column `"name"` reports for HMDB files the `"GENERIC_NAME"`, for
#' ChEBI the `"ChEBI Name"`, for PubChem the `"PUBCHEM_IUPAC_TRADITIONAL_NAME"`,
#' and for Lipid Maps the `"COMMON_NAME"`, if that is
#' not available, the first of the compounds synonyms and, if that is also not
#' provided, the `"SYSTEMATIC_NAME"`.
#'
#' @note
#'
#' `compound_tbl_sdf()` supports also to read/process gzipped files.
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @param file `character(1)` with the name of the SDF file.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#' values in the columns `"synonyms"`. See examples for details.
#'
#' @param onlyValid `logical(1)` to import only valid or all elements (defaults
#' to `onlyValid = TRUE`)
#'
#' @param nonStop `logical(1)` whether file content specific errors should
#' only reported as warnings and not break the full import process. The
#' value of this parameter is passed to parameter `skipErrors` of the
#' [read.SDFset()] function.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's (monoisotopic exact) mass.
#' - `synonyms`: the compound's synonyms (aliases). This type of this column is
#' by default a `list` to support multiple aliases per compound, unless
#' argument `collapse` is provided, in which case multiple synonyms are pasted
#' into a single element separated by the value of `collapse`.
#' - `smiles`: the compound's SMILES (if provided).
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer and Jan Stanstrup
#'
#' @export
#'
#' @seealso [createCompDb()] for a function to create a SQLite-based compound
#' database.
#'
#' @importFrom ChemmineR read.SDFset datablock datablock2ma validSDF
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#' cmps
#'
#' ## Column synonyms contains a list
#' cmps$synonyms
#'
#' ## If we provide the optional argument collapse, multiple entries will be
#' ## collapsed.
#' cmps <- compound_tbl_sdf(fl, collapse = "|")
#' cmps
#' cmps$synonyms
compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
nonStop = TRUE) {
.check_parameter_file(file)
suppressWarnings(
sdf <- read.SDFset(file, skipErrors = nonStop))
nsdf <- length(sdf)
if (onlyValid) {
sdf <- sdf[validSDF(sdf)]
if (length(sdf) != nsdf)
message("Skipped import of ", nsdf - length(sdf),
" invalid elements")
}
res <- .simple_extract_compounds_sdf(datablock2ma(datablock(sdf)))
if (!missing(collapse)) {
## collapse elements from lists.
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
FUN.VALUE = "character")
}
res
}
#' @title Extract compound data from LipidBlast
#'
#' @description
#'
#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
#' LipidBlast file in (json format) downloaded from
#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
#' is extracted from the json file.
#'
#' @param file `character(1)` with the name of the file name.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#' values in the columns `"synonyms"`. See examples for details.
#'
#' @param n `integer(1)` defining the number of rows from the json file that
#' should be read and processed at a time. By default (`n = -1L`) the
#' complete file is imported and processed. For large json files it is
#' suggested to set e.g. `n = 100000` to enable chunk-wise processing and
#' hence reduce the memory demand.
#'
#' @param verbose `logical(1)` whether some progress information should be
#' provided. Defaults to `verbose = FALSE`, but for parsing very large
#' files (specifically with chunk-wise processing enabled with `n` > 0)
#' it might be helpful to set to `verbose = TRUE`.
#'
#' @param BPPARAM `BiocParallelParam` object to configure parallel processing.
#' Defaults to `bpparam()`.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `smiles`: the SMILES representation of the compound.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's mass.
#' - `compound_class`: the class of the compound.
#' - `ionization_mode`: the ionization mode.
#' - `precursor_mz`: the precursor m/z value.
#' - `precursor_type`: the precursor type.
#' - `retention_time`: the retention time.
#' - `ccs`: the collision cross-section.
#' - `spectrum`: the spectrum data (i.e. the mass peaks, as a concatenated
#' character string).
#' - `synonyms`: the compound's synonyms (aliases). This type of this column is
#' by default a `list` to support multiple aliases per compound, unless
#' argument `collapse` is provided, in which case multiple synonyms are pasted
#' into a single element separated by the value of `collapse`.
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer, Jan Stanstrup and Prateek Arora
#'
#' @export
#'
#' @importFrom BiocParallel bpparam
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
#' cmps <- compound_tbl_lipidblast(fl, n = 50000, verbose = TRUE)
#' cmps
compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
verbose = FALSE, BPPARAM = bpparam()) {
.check_parameter_file(file)
res <- .import_lipidblast(file, n = n, verbose = verbose, BPPARAM = BPPARAM)
if (length(collapse)) {
## collapse elements from lists.
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
FUN.VALUE = "character")
}
res
}
.check_parameter_file <- function(file) {
if (missing(file))
stop("Please provide the file name using 'file'")
if (!file.exists(file))
stop("Can not find file ", file)
}
#' @description
#'
#' Internal function to extract compound information from a file in SDF format.
#'
#' @param x what is returned by datablock2ma(datablock(read.SDFset)).
#'
#' @return A [tibble::tibble] with columns `"compound_id"`, `"name"`,
#' `"inchi"`, `"formula"`, `"exactmass"`.
#'
#' @note
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @importFrom tibble tibble
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.simple_extract_compounds_sdf <- function(x) {
source_db <- .guess_sdf_source(colnames(x))
if (is.null(source_db))
stop("The SDF file is not supported. Supported are SDF files from ",
"HMDB, ChEBI and LipidMaps.")
colmap <- get(paste0(".", source_db, "_colmap"))
sep <- get(paste0(".", source_db, "_separator"))
## Fix missing COMMON_NAME entries in LipidMaps (see issue #1)
nms <- x[, colmap["name"]]
syns <- strsplit(x[, colmap["synonyms"]], split = sep)
if (source_db == "lipidmaps") {
nas <- is.na(nms)
if (any(nas))
nms[nas] <- vapply(syns[nas], `[[`, 1, FUN.VALUE = "character")
nas <- is.na(nms)
if (any(nas))
nms[nas] <- x[nas, "SYSTEMATIC_NAME"]
}
res <- tibble(compound_id = x[, colmap["id"]],
name = nms,
inchi = x[, colmap["inchi"]],
inchikey = x[, colmap["inchikey"]],
formula = x[, colmap["formula"]],
exactmass = as.numeric(x[, colmap["exactmass"]]),
synonyms = syns
)
if (is.na(colmap["smiles"])) {
res$smiles <- NA_character_
} else
res$smiles <- x[, colmap["smiles"]]
if (source_db == "mona") {
warning("MoNa data can currently not be normalized and the ",
"compound table contains thus highly redundant data.",
call. = FALSE)
## Eventually reduce and normalize.
inchis <- .extract_field_from_string(x[, "COMMENT"], "InChI")
nona <- !is.na(inchis)
inchis[nona] <- paste0("InChI", inchis[nona])
res$inchi <- inchis
res$compound_id <- .compound_id_from_mona_sdf(x)
res
} else res
}
#' @description
#'
#' Based on the provided `colnames` guess whether the file is from HMDB,
#' ChEBI, LIPID MAPS, PubChem or LipidBlast.
#'
#'
#' @param x `character` with the column names of the data table.
#'
#' @return `character(1)` with the name of the resource or `NULL`.
#'
#' @note
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.guess_sdf_source <- function(x) {
if (any(x == "ChEBI ID"))
return("chebi")
if (any(x == "LM_ID"))
return("lipidmaps")
if (any(x == "PUBCHEM_COMPOUND_CID"))
return("pubchem")
if (any(x == "HMDB_ID"))
return("hmdb")
if (any(x == "ID")) {
message("Guessing we've got a SDF file from MoNa.")
return("mona")
}
NULL
}
.hmdb_colmap <- c(id = "HMDB_ID",
name = "GENERIC_NAME",
inchi = "INCHI_IDENTIFIER",
inchikey = "INCHI_KEY",
formula = "FORMULA",
exactmass = "EXACT_MASS",
synonyms = "SYNONYMS",
smiles = "SMILES"
)
.hmdb_separator <- "; "
.chebi_colmap <- c(id = "ChEBI ID",
name = "ChEBI Name",
inchi = "InChI",
inchikey = "InChIKey",
formula = "Formulae",
exactmass = "Monoisotopic Mass",
synonyms = "Synonyms",
smiles = "SMILES"
)
.chebi_separator <- " __ "
.lipidmaps_colmap <- c(id = "LM_ID",
name = "NAME",
inchi = "INCHI",
inchikey = "INCHI_KEY",
formula = "FORMULA",
exactmass = "EXACT_MASS",
synonyms = "SYNONYMS",
smiles = NA
)
.lipidmaps_separator <- "; "
.pubchem_colmap <- c(id = "PUBCHEM_COMPOUND_CID",
name = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
inchi = "PUBCHEM_IUPAC_INCHI",
inchikey = "PUBCHEM_IUPAC_INCHIKEY",
formula = "PUBCHEM_MOLECULAR_FORMULA",
exactmass = "PUBCHEM_EXACT_MASS",
synonyms = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
smiles = "PUBCHEM_OPENEYE_CAN_SMILES"
# Others:
# PUBCHEM_IUPAC_SYSTEMATIC_NAME,
# PUBCHEM_IUPAC_CAS_NAME,
# PUBCHEM_IUPAC_OPENEYE_NAME,
# PUBCHEM_IUPAC_NAME
)
.pubchem_separator <- "; " # there seems to be none
.mona_colmap <- c(id = "ID",
name = "NAME",
inchi = "INCHIKEY",
inchikey = "INCHIKEY",
formula = "FORMULA",
exactmass = "EXACT MASS",
synonyms = "SYNONYMS",
smiles = NA
)
.mona_separator <- " __ "
#' @description Import compound information from a LipidBlast file in json
#' format.
#'
#' @note This is a modified version from Jan's generate_lipidblast_tbl that
#' extracts the mass also from the json and does not calculate it.
#'
#' @author Jan Stanstrup, Johannes Rainer and Prateek Arora
#'
#' @importFrom jsonlite read_json
#'
#' @importFrom dplyr bind_rows
#' @importFrom BiocParallel bplapply bpparam
#'
#' @md
#' @noRd
.import_lipidblast <- function(file, n = -1L, verbose = FALSE,
BPPARAM = bpparam()) {
if (n < 0) {
lipidb <- read_json(file)
if (verbose)
message("Processing ", length(lipidb), " elements ...",
appendLF = FALSE)
## Use BiocParallel to process elements in parallel
res <- bplapply(lipidb, .parse_lipidblast_json_element,
BPPARAM = BPPARAM)
if (verbose) message(" done.")
} else {
res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose,
BPPARAM = BPPARAM)
}
bind_rows(res)
}
#' @description Parse a single LipidBlast JSON element.
#'
#' @param x A JSON element representing a compound.
#' @return A list containing parsed compound information.
#' @noRd
.parse_lipidblast_json_element <- function(x) {
id <- x$id
cmp <- x$compound[[1]]
## Helper function to extract metadata from multiple sources
get_metadata <- function(name, sources) {
for (source in sources) {
value <- unlist(lapply(source, function(z) {
if (z$name == name) z$value
}))
if (length(value) > 0 && !all(value == "")) {
return(value[1])
}
}
return(NA_character_)
}
## Define metadata sources
metadata_sources <- list(x$metaData, cmp$metaData)
## Get names
nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
snms <- if (length(nms) > 1L) nms[-1L] else NA_character_
## Extract metadata
mass <- get_metadata("total exact mass", metadata_sources)
mass <- as.numeric(mass)
if (is.na(mass)) mass <- NA_real_
frml <- get_metadata("molecular formula", metadata_sources)
inchi <- get_metadata("InChI", metadata_sources)
inchikey <- get_metadata("InChIKey", metadata_sources)
smiles <- get_metadata("SMILES", metadata_sources)
compound_class <- get_metadata("compound class", metadata_sources)
ionization_mode <- get_metadata("ionization mode", metadata_sources)
precursor_mz <- as.numeric(get_metadata("precursor m/z", metadata_sources))
precursor_type <- get_metadata("precursor type", metadata_sources)
retention_time <- as.numeric(get_metadata("retention time", metadata_sources))
ccs <- as.numeric(get_metadata("ccs", metadata_sources))
## Extract spectrum data
spectrum <- x$spectrum
if (is.null(spectrum)) spectrum <- NA_character_
list(
compound_id = id,
name = nms[1],
inchi = inchi,
inchikey = inchikey,
smiles = smiles,
formula = frml,
exactmass = mass,
compound_class = compound_class,
ionization_mode = ionization_mode,
precursor_mz = precursor_mz,
precursor_type = precursor_type,
retention_time = retention_time,
ccs = ccs,
spectrum = spectrum,
synonyms = list(snms)
)
}
#' @description Import compound information from a chunk of a LipidBlast file in JSON format.
#'
#' @param x The file path to the LipidBlast JSON file.
#' @param n The number of lines to read in each chunk.
#' @param verbose Logical, if TRUE, progress messages are printed.
#' @param BPPARAM A BiocParallelParam object specifying the parallel backend.
#' @return A list of parsed compound information.
#' @importFrom jsonlite fromJSON
#' @importFrom dplyr bind_rows
#' @importFrom BiocParallel bplapply bpparam
#' @noRd
.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE,
BPPARAM = bpparam()) {
con <- file(x, open = "r")
on.exit(close(con))
res <- list()
while (length(ls <- readLines(con, n = n, warn = FALSE))) {
if (length(grep("^\\[", ls[1L])))
ls <- ls[-1L]
if (length(grep("^\\]", ls[length(ls)])))
ls <- ls[-length(ls)]
ls <- sub(",$", "", ls)
if (length(ls)) {
chunk_res <- bplapply(ls, function(z) {
.parse_lipidblast_json_element(
jsonlite::fromJSON(z, simplifyVector = FALSE))
}, BPPARAM = BPPARAM)
res <- c(res, chunk_res)
}
if (verbose)
message("Processed ", length(ls), " elements")
}
res
}
#' @title Create a CompDb database
#'
#' @description
#'
#' `CompDb` databases can be created with the `createCompDb()` or the
#' `emptyCompDb()` functions, the former creating and initializing (filling)
#' the database with existing data, the latter creating an empty database that
#' can be subsequently filled with [insertCompound()] or [insertSpectra()]
#' calls.
#'
#' `emptyCompDb()` requires only the file name of the database that should be
#' created as input and returns a `CompDb` representing the empty database.
#'
#' `createCompDb()` creates a `SQLite`-based [`CompDb`] object/database
#' from a compound resource provided as a `data.frame` or `tbl`. Alternatively,
#' the name(s) of the file(s) from which the annotation should be extracted can
#' be provided. Supported are SDF files (such as those from the
#' *Human Metabolome Database* HMDB) that can be read using the
#' [compound_tbl_sdf()] or LipidBlast files (see [compound_tbl_lipidblast()].
#'
#' An additional `data.frame` providing metadata information including the data
#' source, date, version and organism is mandatory. By default, the function
#' will define the name of the database based on the provided metadata, but it
#' is also possible to define this manually with the `dbFile` parameter.
#'
#' Optionally MS/MS (MS2) spectra for compounds can be also stored in the
#' database. Currently only MS/MS spectra from HMDB are supported. These can
#' be downloaded in XML format from HMDB (http://www.hmdb.ca), loaded with
#' the [msms_spectra_hmdb()] or [msms_spectra_mona()] function and passed to
#' the function with the `msms_spectra` argument. See [msms_spectra_hmdb()] or
#' [msms_spectra_mona()] for information on the expected columns and format.
#'
#' Required columns for the `data.frame` providing the compound information (
#' parameter `x`) are:
#'
#' - `"compound_id"`: the ID of the compound. Can be an `integer` or
#' `character`. Duplicated IDs are supported (for compatibility reasons), but
#' not suggested. No missing values allowed.
#' - `"name"`: the compound's name.
#' - `"inchi"`: the InChI of the compound.
#' - `"inchikey"`: the InChI key.
#' - `"formula"`: the chemical formula.
#' - `"exactmass"`: the compound's (exact) mass.
#' - `"synonyms"`: additional synonyms/aliases for the compound. Should be
#' either a single character or a list of values for each compound.
#'
#' Any additional columns in the provided `data.frame` (such as e.g. `"smiles"`
#' providing the compound's SMILES) are also supported and will be inserted into
#' the database table.
#'
#' See e.g. [compound_tbl_sdf()] or [compound_tbl_lipidblast()] for functions
#' creating such compound tables.
#'
#' The table containing the MS2 spectra data should have the following format
#' and columns:
#'
#' - `"spectrum_id"`: an arbitrary ID for the spectrum. Has to be an `integer`.
#' - `"compound_id"`: the ID of the compound to which the spectrum can be
#' associated with. This has to be present in the `data.frame` defining the
#' compounds.
#' - `"polarity"`: the polarity (as an `integer`, `0` for negative, `1` for
#' positive, `NA` for not set).
#' - `"collision_energy"`: the collision energy.
#' - `"predicted"`: whether the spectrum was predicted or measured.
#' - `"splash"`: the SPLASH of the spectrum.
#' - `"instrument_type"`: the instrument type.
#' - `"instrument"`: the name of the instrument.
#' - `"precursor_mz"`: the precursor m/z (as a `numeric`).
#' - `"mz"`: the m/z values.
#' - `"intensity"`: the intensity values.
#'
#' Only for columns `"spectrum_id"`, `"compound_id"`, `"mz"` and `"intensity"`
#' a value has to be provided in each row of the `data.frame`. The others are
#' optional. Note that the `data.frame` can be either in the format as in the
#' example below (i.e. each row being one spectrum and columns `"mz"` and
#' `"intensity"` being of type `list` each element being the m/z or intensity
#' values of one spectrum) or in a *full* form, in which each row represents
#' one *peak* and all columns except `"mz"` and `"intensity"` containing
#' redundant information of each spectrum (hence columns `"mz"` and
#' `"intensity"` being of type `numeric`).
#'
#' The metadata `data.frame` is supposed to have two columns named `"name"` and
#' `"value"` providing the following minimal information as key-value pairs
#' (see `make_metadata` for a utility function to create such a `data.frame`):
#'
#' - `"source"`: the source from which the data was retrieved (e.g. `"HMDB"`).
#' - `"url"`: the url from which the original data was retrieved.
#' - `"source_version"`: the version from the original data source
#' (e.g. `"v4"`).
#' - `"source_date"`: the date when the original data source was generated.
#' - `"organism"`: the organism. Should be in the form `"Hsapiens"` or
#' `"Mmusculus"`.
#'
#' @details
#'
#' Metadata information is also used to create the file name for the database
#' file. The name starts with `"CompDb"`, followed by the organism, the
#' data source and its version. A compound database file for HMDB version 4
#' with human metabolites will thus be named: `"CompDb.Hsapiens.HMDB.v4"`.
#'
#' A single `CompDb` database is created from multiple SDF files (e.g. for
#' *PubChem*) if all the file names are provided with parameter `x`. Parallel
#' processing is currently not enabled because SQLite does not support it yet
#' natively.
#'
#' @param x For `createCompDb()`: `data.frame` or `tbl` with the compound
#' annotations or `character` with the file name(s) from which the compound
#' annotations should be retrieved. See description for details.
#'
#' For `createCompDbPackage()`: `character(1)` with the file name of the
#' `CompDb` SQLite file (created by `createCompDb`).
#'
#' @param metadata For `createCompDb()`: `data.frame` with metadata
#' information. See description for details.
#'
#' @param msms_spectra For `createCompDb()`: `data.frame` with MS/MS spectrum
#' data. See [msms_spectra_hmdb()] for the expected format and a function
#' to import such data from spectrum xml files from HMDB.
#'
#' @param path `character(1)` with the path to the directory where the database
#' file or package folder should be written. Defaults to the current
#' directory.
#'
#' @param dbFile `character(1)` to optionally provide the name of the SQLite
#' database file. If not provided (the default) the database name is defined
#' using information from the provided `metadata`.
#'
#' @return For `createCompDb()`: a `character(1)` with the database name
#' (invisibly).
#'
#' @importFrom DBI dbDriver dbWriteTable dbExecute dbDisconnect
#' @importFrom RSQLite dbConnect
#' @importFrom dplyr bind_cols bind_rows
#' @importFrom MsCoreUtils rbindFill
#'
#' @export
#'
#' @author Johannes Rainer
#'
#' @seealso
#'
#' [compound_tbl_sdf()] and [compound_tbl_lipidblast()] for functions
#' to extract compound annotations from files in SDF format, or files from
#' LipidBlast.
#'
#' [import_mona_sdf()] to import both the compound and spectrum data from a
#' SDF file from MoNa (Massbank of North America) in one call.
#'
#' [msms_spectra_hmdb()] and [msms_spectra_mona()] for functions to import
#' MS/MS spectrum data from xml files from HMDB or an SDF file from MoNa.
#'
#' [CompDb()] for how to use a compound database.
#'
#' @examples
#'
#' ## Read compounds for a HMDB subset
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#'
#' ## Create a metadata data.frame for the compounds.
#' metad <- data.frame(name = c("source", "url", "source_version",
#' "source_date", "organism"), value = c("HMDB", "http://www.hmdb.ca",
#' "v4", "2017-08-27", "Hsapiens"))
#'
#' ## Alternatively use the make_metadata helper function
#' metad <- make_metadata(source = "HMDB", source_version = "v4",
#' source_date = "2017-08", organism = "Hsapiens",
#' url = "http://www.hmdb.ca")
#' ## Create a SQLite database in the temporary folder
#' db_f <- createCompDb(cmps, metadata = metad, path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db <- CompDb(db_f)
#' db
#'
#' ## Create a database for HMDB that includes also MS/MS spectrum data
#' metad2 <- make_metadata(source = "HMDB_with_spectra", source_version = "v4",
#' source_date = "2017-08", organism = "Hsapiens",
#' url = "http://www.hmdb.ca")
#'
#' ## Import spectrum information from selected MS/MS xml files from HMDB
#' ## that are provided in the package
#' xml_path <- system.file("xml", package = "CompoundDb")
#' spctra <- msms_spectra_hmdb(xml_path)
#'
#' ## Create a SQLite database in the temporary folder
#' db_f2 <- createCompDb(cmps, metadata = metad2, msms_spectra = spctra,
#' path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db2 <- CompDb(db_f2)
#' db2
#'
#' ## Does the database contain MS/MS spectrum data?
#' hasMsMsSpectra(db2)
#'
#' ## Create a database for a ChEBI subset providing the file name of the
#' ## corresponding SDF file
#' metad <- make_metadata(source = "ChEBI_sub", source_version = "2",
#' source_date = NA, organism = "Hsapiens", url = "www.ebi.ac.uk/chebi")
#' db_f <- createCompDb(system.file("sdf/ChEBI_sub.sdf.gz",
#' package = "CompoundDb"), metadata = metad, path = tempdir())
#' db <- CompDb(db_f)
#' db
#'
#' compounds(db)
#'
#' ## connect to the database and query it's tables using RSQlite
#' library(RSQLite)
#' con <- dbConnect(dbDriver("SQLite"), db_f)
#'
#' dbGetQuery(con, "select * from metadata")
#' dbGetQuery(con, "select * from ms_compound")
#'
#' ## To create a CompDb R-package we could simply use the
#' ## createCompDbPackage function on the SQLite database file name.
createCompDb <- function(x, metadata, msms_spectra, path = ".",
dbFile = character()) {
.valid_metadata(metadata)
if (!length(dbFile))
db_file <- file.path(path, paste0(.db_file_from_metadata(metadata),
".sqlite"))
else db_file <- file.path(path, dbFile)
con <- dbConnect(dbDriver("SQLite"), dbname = db_file)
## Add additional metadata info
metadata$name <- as.character(metadata$name)
metadata$value <- as.character(metadata$value)
metadata <- bind_rows(metadata, c(name = "db_creation_date",
value = date()))
metadata <- bind_rows(metadata, c(name = "supporting_package",
value = "CompoundDb"))
metadata <- bind_rows(metadata, c(name = "supporting_object",
value = "CompDb"))
dbWriteTable(con, name = "metadata", metadata, row.names = FALSE)
## ms_compound table
if (is.character(x)) {
tbls <- snms <- vector("list", length(x))
for (i in seq_along(x)) {
z <- x[i]
message("Import data from ", basename(z), " ...", appendLF = FALSE)
if (.is_sdf_filename(z)) {
tbl <- compound_tbl_sdf(z)
} else {
## Assume this is a json file... might simply be wrong.
tbl <- compound_tbl_lipidblast(z)
}
message("OK")
.valid_compound(tbl, db = FALSE)
snms[[i]] <- bind_cols(compound_id = rep(tbl$compound_id,
lengths(tbl$synonyms)),
synonym = unlist(tbl$synonyms))
tbls[[i]] <- tbl[, colnames(tbl) != "synonyms"]
}
tbls <- do.call(rbindFill, tbls)
snms <- do.call(rbindFill, snms)
dbWriteTable(con, name = "synonym", snms, row.names = FALSE,
append = TRUE)
dbWriteTable(con, name = "ms_compound", row.names = FALSE,
tbls, append = TRUE)
} else {
.valid_compound(x, db = FALSE)
x_synonym <- bind_cols(compound_id = rep(x$compound_id,
lengths(x$synonyms)),
synonym = unlist(x$synonyms))
dbWriteTable(con, name = "synonym", x_synonym, row.names = FALSE)
dbWriteTable(con, name = "ms_compound", x[, colnames(x) != "synonyms"],
row.names = FALSE)
}
## Creating indices
dbExecute(con, "create index compound_id_idx on ms_compound (compound_id)")
dbExecute(con, "create index compound_name_idx on ms_compound (name)")
## Process spectra.
if (!missing(msms_spectra) && is.data.frame(msms_spectra)) {
comp_ids <- unique(x$compound_id)
if (!all(msms_spectra$compound_id %in% comp_ids))
stop("All compound identifiers in 'msms_spectra' have to be ",
"present also in 'x'")
msms_spectra <- msms_spectra[msms_spectra$compound_id %in% comp_ids, ]
.insert_msms_spectra(con, msms_spectra)
}
dbDisconnect(con)
invisible(db_file)
}
#' Add mz_min and mz_max columns to data.frame x
#'
#' @param x `data.frame` with MS MS spectrum data such as returned by
#' [msms_spectra_hmdb()] **and** eventually collapsed with
#' [.collapse_spectrum_df].
#'
#' @noRd
#'
#' @author Johannes Rainer
.add_mz_range_column <- function(x) {
if (!any(colnames(x) == "mz"))
stop("Required column 'mz' not found")
if (is.list(x$mz)) {
mzr <- do.call(rbind, lapply(x$mz, range))
x$msms_mz_range_min <- mzr[, 1]
x$msms_mz_range_max <- mzr[, 2]
x
} else {
fct <- factor(x$spectrum_id, levels = unique(x$spectrum_id))
mzs <- split(x$mz, f = fct)
mzr <- do.call(rbind, lapply(mzs, range, na.rm = TRUE))
x$msms_mz_range_min <- unsplit(mzr[, 1], fct)
x$msms_mz_range_max <- unsplit(mzr[, 2], fct)
x
}
}
#' Function to insert MS/MS spectrum data into two database tables:
#' msms_spectrum_peak with the individual m/z and intensity values and
#' msms_spectrum_metadata with the spectrum annotations. Both tables are linked
#' via the spectrum_id.
#'
#' @param con database connection
#'
#' @param x `data.frame` with the spectrum data, such as returned by
#' [msms_spectra_hmdb]
#'
#' @noRd
#'
#' @author Johannes Rainer
.insert_msms_spectra <- function(con, x) {
x <- .prepare_msms_spectra_table(x)
msms_spectrum_peak <- x$msms_spectrum_peak
x <- x$x
dbWriteTable(con, name = "msms_spectrum_peak", msms_spectrum_peak,
row.names = FALSE)
dbWriteTable(con, name = "msms_spectrum", x, row.names = FALSE)
dbExecute(
con, "create index msms_id_idx on msms_spectrum_peak (spectrum_id)")
dbExecute(con, "create index msms_pid_idx on msms_spectrum_peak (peak_id)")
dbExecute(con, "create index msms_mid_idx on msms_spectrum (spectrum_id)")
dbExecute(con, "create index msms_cid_idx on msms_spectrum (compound_id)")
}
.prepare_msms_spectra_table <- function(x) {
x <- .msms_spectrum_add_missing_columns(x)
x$collision_energy <- as.character(x$collision_energy)
.valid_msms_spectrum(x, blob = is.list(x$mz))
x <- .add_mz_range_column(x)
msms_spectrum_peak <- x[, c("spectrum_id", "mz", "intensity")]
if (is.list(msms_spectrum_peak$mz))
msms_spectrum_peak <- .expand_spectrum_df(msms_spectrum_peak)
if (!is.numeric(msms_spectrum_peak$mz) ||
!is.numeric(msms_spectrum_peak$intensity))
stop("Columns 'mz' and 'intensity' should only contain numeric values")
msms_spectrum_peak <- msms_spectrum_peak[
order(msms_spectrum_peak$spectrum_id, msms_spectrum_peak$mz), ]
msms_spectrum_peak$peak_id <- seq_len(nrow(msms_spectrum_peak))
x <- unique(x[, !(colnames(x) %in% c("mz", "intensity"))])
list(x = x, msms_spectrum_peak = msms_spectrum_peak)
}
.append_msms_spectra <- function(con, x) {
## Update spectrum_id
nsp <- dbGetQuery(con, paste0("select max(spectrum_id)",
" from msms_spectrum"))[1, 1]
if("spectrum_id" %in% colnames(x))
warning("'spectrum_id' variable in 'spectra' will be
replaced with internal indexes")
x$spectrum_id <- nsp + seq_len(nrow(x))
x <- .prepare_msms_spectra_table(x)
msms_spectrum_peak <- x$msms_spectrum_peak
x <- x$x
## Create eventual additional columns in database.
cols <- colnames(dbGetQuery(con, "select * from msms_spectrum limit 1"))
new_cols <- colnames(x)[!colnames(x) %in% cols]
dtype <- dbDataType(con, x[, new_cols, drop = FALSE])
dtype <- paste(names(dtype), dtype)
for (dt in dtype)
dbExecute(con, paste("alter table msms_spectrum add", dt))
dbAppendTable(con, "msms_spectrum", x)
## Update peak_id
nsp <- dbGetQuery(con, paste0("select max(peak_id)",
" from msms_spectrum_peak"))[1, 1]
msms_spectrum_peak$peak_id <- nsp + seq_len(nrow(msms_spectrum_peak))
dbAppendTable(
con, "msms_spectrum_peak",
msms_spectrum_peak[, c("spectrum_id", "mz", "intensity", "peak_id")])
}
.msms_spectrum_add_missing_columns <- function(x) {
cols <- names(.required_msms_spectrum_columns)
cols <- cols[!cols %in% c("spectrum_id", "compound_id", "mz", "intensity")]
cols <- cols[!cols %in% colnames(x)]
n <- nrow(x)
if (length(cols)) {
x_new <- as.data.frame(
lapply(.required_msms_spectrum_columns[cols],
as, object = rep(NA, n)))
x <- cbind(x, x_new)
}
x
}
.required_metadata_keys <- c("source", "url", "source_version", "source_date")
.required_compound_db_columns <- c("compound_id", "name", "inchi",
"inchikey", "formula", "exactmass")
.required_compound_columns <- c(.required_compound_db_columns, "synonyms")
.required_ion_columns <- c("compound_id", "ion_adduct", "ion_mz", "ion_rt")
.required_msms_spectrum_columns <- c(spectrum_id = "integer",
compound_id = "character",
polarity = "integer",
collision_energy = "character",
predicted = "logical",
splash = "character",
instrument_type = "character",
instrument = "character",
precursor_mz = "numeric",
mz = "numeric",
intensity = "numeric"
)
.required_msms_spectrum_columns_blob <- c(spectrum_id = "integer",
compound_id = "character",
polarity = "integer",
collision_energy = "character",
predicted = "logical",
splash = "character",
instrument_type = "character",
instrument = "character",
precursor_mz = "numeric",
mz = "list",
intensity = "list"
)
.valid_msms_spectrum <- function(x, blob = FALSE, error = TRUE) {
coltypes <- unlist(lapply(x, class))
msg <- NULL
if (blob)
req_cols <- .required_msms_spectrum_columns_blob
else req_cols <- .required_msms_spectrum_columns
not_found <- which(!(names(req_cols) %in% names(coltypes)))
if (length(not_found))
msg <- c(msg, paste0("Required column(s) ",
paste0("'", names(req_cols)[not_found], "'"),
" not found in 'msms_spectra'"))
common_cols <- union(names(coltypes), names(req_cols))
wrong_type <- which(coltypes[common_cols] != req_cols[common_cols])
if (length(wrong_type))
msg <- c(msg,
paste0("One or more columns contain data of the wrong type: ",
paste(names(coltypes[names(wrong_type)]),
coltypes[names(wrong_type)], sep = ":",
collapse = ", "),
". Please refer to the documentation of createCompDb ",
"for the correct data types."))
if (length(msg))
if (error) stop(msg)
else msg
else TRUE
}
#' @description Create the database file name from the metadata `data.frame`.
#' The function checks also for the presence of all required metadata fields
#' ensuring that these are also not `NA` or `NULL`.
#'
#' @noRd
.db_file_from_metadata <- function(x) {
vls <- c(x$value[x$name == "organism"], x$value[x$name == "source"],
x$value[x$name == "source_version"])
vls <- vls[!is.na(vls)]
paste0(c("CompDb", vls), collapse = ".")
}
#' @description Check the metadata `data.frame` for required columns.
#'
#' @noRd
.valid_metadata <- function(metadata, error = TRUE) {
txt <- .valid_data_frame_columns(metadata, "metadata", c("name", "value"))
if (!length(txt)) {
keys <- metadata$name
vals <- metadata$value
got_it <- .required_metadata_keys %in% keys
if (!all(got_it))
txt <- c(txt, paste0("required fields ",
paste0(.required_metadata_keys[!got_it],
collapse = ", "),
" not found in metadata data.frame"))
}
.throw_error(txt, error = error)
}
.throw_error <- function(txt = character(), error = TRUE) {
if (length(txt)) {
if (error) {
txt <- paste(txt, collapse = "\n")
stop(txt)
} else txt
} else
TRUE
}
.valid_data_frame_columns <- function(x, varName = "x",
req_cols = .required_compound_columns) {
txt <- character()
if (!is.data.frame(x))
return(paste0("'", varName, "' is supposed to be a data.frame"))
got_it <- req_cols %in% colnames(x)
if (!all(got_it))
txt <- c(txt, paste0("Miss required columns: ",
paste0(req_cols[!got_it], collapse = ", ")))
txt
}
#' @description Check that the compounds table contains all required data.
#'
#' @param db `logical(1)` whether validity should be checked on the internal
#' database table instead of the input file.
#' @md
#'
#' @noRd
.valid_compound <- function(x, error = TRUE, db = TRUE) {
if (db)
txt <- .valid_data_frame_columns(x, "x", .required_compound_db_columns)
else
txt <- .valid_data_frame_columns(x, "x", .required_compound_columns)
if (!length(txt) && !is.numeric(x$exactmass))
txt <- c(txt, "Column 'exactmass' should be numeric")
if (!length(txt) && (any(is.na(x$compound_id)) | any(x$compound_id == "")))
txt <- c(txt, "Missing values are not allowed in column 'compound_id'")
.throw_error(txt, error = error)
}
#' @description
#'
#' `createCompDbPackage` creates an R data package with the data from a
#' [`CompDb`] object.
#'
#' @importFrom Biobase createPackage
#'
#' @param version For `createCompDbPackage()`: `character(1)` with the version
#' of the package (ideally in the format `"x.y.z"`).
#'
#' @param maintainer For `createCompDbPackage()`: `character(1)` with the
#' name and email address of the package maintainer (in the form
#' `"First Last <first.last@provider.com>"`.
#'
#' @param author For `createCompDbPackage()`: `character(1)` with the name
#' of the package author.
#'
#' @param license For `createCompDbPackage()`: `character(1)` with the
#' license of the package respectively the originating provider.
#'
#' @export
#'
#' @rdname createCompDb
createCompDbPackage <- function(x, version, maintainer, author,
path = ".", license = "Artistic-2.0") {
if (missing(x) | missing(version) | missing(maintainer) | missing(author))
stop("'x', 'version', 'maintainer' and 'author' are required")
if (!is.character(x))
stop("'x' is supposed to be the file name of the CompDb")
cdb <- CompDb(x)
metad <- .metadata(cdb)
pkg_name <- .db_file_from_metadata(metad)
m_source <- .metadata_value(cdb, "source")
m_source_version <- .metadata_value(cdb, "source_version")
m_source_date <- .metadata_value(cdb, "source_date")
m_organism <- .metadata_value(cdb, "organism")
m_source_url <- .metadata_value(cdb, "url")
template_path <- system.file("pkg-template", package = "CompoundDb")
symvals <- list(
PKGTITLE = paste0(m_source, " compound annotation package"),
PKGDESCRIPTION = paste0("Exposes a CompDb compound annotation ",
"databases with annotations retrieved from ",
m_source, "."),
PKGVERSION = version,
AUTHOR = author,
MAINTAINER = maintainer,
LIC = license,
ORGANISM = m_organism,
SPECIES = m_organism,
SOURCE = m_source,
SOURCEVERSION = as.character(m_source_version),
RELEASEDATE = m_source_date,
SOURCEURL = m_source_url,
DBOBJNAME = pkg_name
)
createPackage(pkgname = pkg_name, destinationDir = path,
originDir = template_path, symbolValues = symvals)
sqlite_path <- file.path(path, pkg_name, "inst", "extdata")
dir.create(sqlite_path, showWarnings = FALSE, recursive = TRUE)
file.copy(x, to = file.path(sqlite_path, paste0(pkg_name, ".sqlite")))
invisible(file.path(path, pkg_name))
}
#' @description Simple test function to evaluate whether a file is an SDF file
#' based on its file name.
#'
#' @param x `character(1)` the file name to be tested.
#'
#' @noRd
#'
#' @md
#'
#' @author Johannes Rainer
.is_sdf_filename <- function(x) {
grepl(".sdf($|.gz$)", x, ignore.case = TRUE)
}
#' @description `make_metadata()` helps generating a metadata `data.frame` in
#' the correct format expected by the `createCompDb` function. The function
#' returns a `data.frame`.
#'
#' @param source For `make_metadata()`: `character(1)` with the name of the
#' resource that provided the compound annotation.
#'
#' @param url For `make_metadata()`: `character(1)` with the url to the original
#' resource.
#'
#' @param source_version For `make_metadata()`: `character(1)` with the version
#' of the original resource providing the annotation.
#'
#' @param source_date For `make_metadata()`: `character(1)` with the date of
#' the resource's release.
#'
#' @param organism For `make_metadata()`: `character(1)` with the name of the
#' organism. This should be in the format `"Hsapiens"` for human,
#' `"Mmusculus"` for mouse etc. Leave to `NA` if not applicable.
#'
#' @export
#'
#' @rdname createCompDb
make_metadata <- function(source = character(), url = character(),
source_version = character(),
source_date = character(),
organism = NA_character_) {
if (!length(source) || !length(url) || !length(source_version) ||
!length(source_date) || !length(organism))
stop("Arguments 'source', 'url', 'source_version', 'source_date' ",
"and 'organism' are required")
data.frame(name = c("source", "url", "source_version", "source_date",
"organism"),
value = as.character(c(source, url, source_version,
source_date, organism)),
stringsAsFactors = FALSE)
}
#' @title Import compound and spectrum information from MoNa
#'
#' @description
#'
#' `import_mona_sdf()` allows to import compound and spectrum information from
#' an SDF file from MoNa (Massbank of North America
#' http://mona.fiehnlab.ucdavis.edu/). This function is a convenience function
#' using the [compound_tbl_sdf()] and [msms_spectra_mona()] functions for data
#' import but avoiding to read the SDF files twice.
#'
#' @note
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' @param x `character(1)` being the SDF file name.
#'
#' @param nonStop `logical(1)` whether file content specific errors should
#' only reported as warnings and not break the full import process. The
#' value of this parameter is passed to parameter `skipErrors` of the
#' [read.SDFset()] function.
#'
#' @author Johannes Rainer
#'
#' @return A `list` with elements `"compound"` and `"msms_spectrum"` containing
#' data.frames with compound and MS/MS spectrum data, respectively.
#'
#' @export
#'
#' @seealso
#'
#' [compound_tbl_sdf()] to read only the compound information.
#'
#' [msms_spectra_mona()] to read only the spectrum data.
#'
#' @examples
#'
#' ## Define the test file containing a small subset from MoNa
#' fl <- system.file("sdf/MoNa_export-All_Spectra_sub.sdf.gz",
#' package = "CompoundDb")
#'
#' ## Import the data
#' res <- import_mona_sdf(fl)
import_mona_sdf <- function(x, nonStop = TRUE) {
.import_mona_sdf(x, nonStop, TRUE)
}
.import_mona_sdf <- function(x, nonStop = TRUE, compounds = TRUE) {
message("Reading SDF file ... ", appendLF = FALSE)
sdfs <- datablock2ma(datablock(read.SDFset(x, skipErrors = nonStop)))
if (!any(colnames(sdfs) == "MASS SPECTRAL PEAKS"))
stop("The provided file does not contain \"MASS SPECTRAL PEAKS\" ",
"elements. Is this an SDF file from MoNa?")
message("OK")
if (compounds) {
message("Extracting compound information ... ", appendLF = FALSE)
suppressMessages(cmps <- .simple_extract_compounds_sdf(sdfs))
message("OK")
} else cmps <- data.frame()
message("Extracting spectrum information ... ", appendLF = FALSE)
spctra <- .extract_spectra_mona_sdf(sdfs)
message("OK")
list(compound = cmps, msms_spectrum = spctra)
}
#' @importFrom RSQLite SQLITE_RW
#'
#' @export
#'
#' @rdname createCompDb
emptyCompDb <- function(dbFile = character()) {
if (file.exists(dbFile))
stop("The file \"", dbFile, "\" does already exist.")
## Initialize tables.
comps <- data.frame(compound_id = character(), name = character(),
inchi = character(), inchikey = character(),
formula = character(), exactmass = numeric(),
synonyms = character())
metad <- data.frame(name = c("source", "url", "source_version",
"source_date", "organism"),
value = NA_character_)
## source, url, source_version, source_date, organism.
createCompDb(comps, metad, path = dirname(dbFile),
dbFile = basename(dbFile))
CompDb(dbFile, flags = SQLITE_RW)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.