R/createCompDbPackage.R

Defines functions emptyCompDb .import_mona_sdf import_mona_sdf make_metadata .is_sdf_filename createCompDbPackage .valid_compound .valid_data_frame_columns .throw_error .valid_metadata .db_file_from_metadata .valid_msms_spectrum .msms_spectrum_add_missing_columns .append_msms_spectra .prepare_msms_spectra_table .insert_msms_spectra .add_mz_range_column createCompDb .import_lipidblast_json_chunk .parse_lipidblast_json_element .import_lipidblast .guess_sdf_source .simple_extract_compounds_sdf .check_parameter_file compound_tbl_lipidblast compound_tbl_sdf

Documented in compound_tbl_lipidblast compound_tbl_sdf createCompDb createCompDbPackage emptyCompDb import_mona_sdf make_metadata

#' @title Extract compound data from a file in SDF format
#'
#' @description
#'
#' `compound_tbl_sdf()` extracts basic compound annotations from a file in SDF
#' format (structure-data file). The function currently supports SDF files from:
#'
#' - HMDB (Human Metabolome Database): http://www.hmdb.ca
#' - ChEBI (Chemical Entities of Biological Interest): http://ebi.ac.uk/chebi
#' - LMSD (LIPID MAPS Structure Database): http://www.lipidmaps.org
#' - PubChem: https://pubchem.ncbi.nlm.nih.gov/
#' - MoNa: http://mona.fiehnlab.ucdavis.edu/ (see notes below!)
#'
#' @details
#'
#' Column `"name"` reports for HMDB files the `"GENERIC_NAME"`, for
#' ChEBI the `"ChEBI Name"`, for PubChem the `"PUBCHEM_IUPAC_TRADITIONAL_NAME"`,
#' and for Lipid Maps the `"COMMON_NAME"`, if that is
#' not available, the first of the compounds synonyms and, if that is also not
#' provided, the `"SYSTEMATIC_NAME"`.
#'
#' @note
#'
#' `compound_tbl_sdf()` supports also to read/process gzipped files.
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @param file `character(1)` with the name of the SDF file.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#'     values in the columns `"synonyms"`. See examples for details.
#'
#' @param onlyValid `logical(1)` to import only valid or all elements (defaults
#'     to `onlyValid = TRUE`)
#'
#' @param nonStop `logical(1)` whether file content specific errors should
#'     only reported as warnings and not break the full import process. The
#'     value of this parameter is passed to parameter `skipErrors` of the
#'     [read.SDFset()] function.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's (monoisotopic exact) mass.
#' - `synonyms`: the compound's synonyms (aliases). This type of this column is
#'   by default a `list` to support multiple aliases per compound, unless
#'   argument `collapse` is provided, in which case multiple synonyms are pasted
#'   into a single element separated by the value of `collapse`.
#' - `smiles`: the compound's SMILES (if provided).
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer and Jan Stanstrup
#'
#' @export
#'
#' @seealso [createCompDb()] for a function to create a SQLite-based compound
#'     database.
#'
#' @importFrom ChemmineR read.SDFset datablock datablock2ma validSDF
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#' cmps
#'
#' ## Column synonyms contains a list
#' cmps$synonyms
#'
#' ## If we provide the optional argument collapse, multiple entries will be
#' ## collapsed.
#' cmps <- compound_tbl_sdf(fl, collapse = "|")
#' cmps
#' cmps$synonyms
compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
                             nonStop = TRUE) {
    .check_parameter_file(file)
    suppressWarnings(
        sdf <- read.SDFset(file, skipErrors = nonStop))
    nsdf <- length(sdf)
    if (onlyValid) {
        sdf <- sdf[validSDF(sdf)]
        if (length(sdf) != nsdf)
            message("Skipped import of ", nsdf - length(sdf),
                    " invalid elements")
    }
    res <- .simple_extract_compounds_sdf(datablock2ma(datablock(sdf)))
    if (!missing(collapse)) {
        ## collapse elements from lists.
        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
                               FUN.VALUE = "character")
    }
    res
}

#' @title Extract compound data from LipidBlast
#'
#' @description
#'
#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
#' LipidBlast file in (json format) downloaded from
#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
#' is extracted from the json file.
#'
#' @param file `character(1)` with the name of the file name.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#'     values in the columns `"synonyms"`. See examples for details.
#'
#' @param n `integer(1)` defining the number of rows from the json file that
#'     should be read and processed at a time. By default (`n = -1L`) the
#'     complete file is imported and processed. For large json files it is
#'     suggested to set e.g. `n = 100000` to enable chunk-wise processing and
#'     hence reduce the memory demand.
#'
#' @param verbose `logical(1)` whether some progress information should be
#'     provided. Defaults to `verbose = FALSE`, but for parsing very large
#'     files (specifically with chunk-wise processing enabled with `n` > 0)
#'     it might be helpful to set to `verbose = TRUE`.
#'
#' @param BPPARAM `BiocParallelParam` object to configure parallel processing.
#'     Defaults to `bpparam()`.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `smiles`: the SMILES representation of the compound.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's mass.
#' - `compound_class`: the class of the compound.
#' - `ionization_mode`: the ionization mode.
#' - `precursor_mz`: the precursor m/z value.
#' - `precursor_type`: the precursor type.
#' - `retention_time`: the retention time.
#' - `ccs`: the collision cross-section.
#' - `spectrum`: the spectrum data (i.e. the mass peaks, as a concatenated
#'   character string).
#' - `synonyms`: the compound's synonyms (aliases). This type of this column is
#'   by default a `list` to support multiple aliases per compound, unless
#'   argument `collapse` is provided, in which case multiple synonyms are pasted
#'   into a single element separated by the value of `collapse`.
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer, Jan Stanstrup and Prateek Arora
#'
#' @export
#'
#' @importFrom BiocParallel bpparam
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
#' cmps <- compound_tbl_lipidblast(fl, n = 50000, verbose = TRUE)
#' cmps
compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
                                    verbose = FALSE, BPPARAM = bpparam()) {
    .check_parameter_file(file)
    res <- .import_lipidblast(file, n = n, verbose = verbose, BPPARAM = BPPARAM)
    if (length(collapse)) {
        ## collapse elements from lists.
        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
                                FUN.VALUE = "character")
    }
    res
}

.check_parameter_file <- function(file) {
    if (missing(file))
        stop("Please provide the file name using 'file'")
    if (!file.exists(file))
        stop("Can not find file ", file)
}

#' @description
#'
#' Internal function to extract compound information from a file in SDF format.
#'
#' @param x what is returned by datablock2ma(datablock(read.SDFset)).
#'
#' @return A [tibble::tibble] with columns `"compound_id"`, `"name"`,
#'     `"inchi"`, `"formula"`, `"exactmass"`.
#'
#' @note
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @importFrom tibble tibble
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.simple_extract_compounds_sdf <- function(x) {
    source_db <- .guess_sdf_source(colnames(x))
    if (is.null(source_db))
        stop("The SDF file is not supported. Supported are SDF files from ",
             "HMDB, ChEBI and LipidMaps.")
    colmap <- get(paste0(".", source_db, "_colmap"))
    sep <- get(paste0(".", source_db, "_separator"))
    ## Fix missing COMMON_NAME entries in LipidMaps (see issue #1)
    nms <- x[, colmap["name"]]
    syns <- strsplit(x[, colmap["synonyms"]], split = sep)
    if (source_db == "lipidmaps") {
        nas <- is.na(nms)
        if (any(nas))
            nms[nas] <- vapply(syns[nas], `[[`, 1, FUN.VALUE = "character")
        nas <- is.na(nms)
        if (any(nas))
            nms[nas] <- x[nas, "SYSTEMATIC_NAME"]
    }
    res <- tibble(compound_id = x[, colmap["id"]],
                      name = nms,
                      inchi = x[, colmap["inchi"]],
                      inchikey = x[, colmap["inchikey"]],
                      formula = x[, colmap["formula"]],
                      exactmass = as.numeric(x[, colmap["exactmass"]]),
                      synonyms = syns
                      )
    if (is.na(colmap["smiles"])) {
        res$smiles <- NA_character_
    } else
        res$smiles <- x[, colmap["smiles"]]
    if (source_db == "mona") {
        warning("MoNa data can currently not be normalized and the ",
                "compound table contains thus highly redundant data.",
                call. = FALSE)
        ## Eventually reduce and normalize.
        inchis <- .extract_field_from_string(x[, "COMMENT"], "InChI")
        nona <- !is.na(inchis)
        inchis[nona] <- paste0("InChI", inchis[nona])
        res$inchi <- inchis
        res$compound_id <- .compound_id_from_mona_sdf(x)
        res
    } else res
}

#' @description
#'
#' Based on the provided `colnames` guess whether the file is from HMDB,
#' ChEBI, LIPID MAPS, PubChem or LipidBlast.
#'
#'
#' @param x `character` with the column names of the data table.
#'
#' @return `character(1)` with the name of the resource or `NULL`.
#'
#' @note
#'
#' LIPID MAPS was tested August 2020. Older SDF files might not work as the
#' field names were changed.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.guess_sdf_source <- function(x) {
    if (any(x == "ChEBI ID"))
        return("chebi")
    if (any(x == "LM_ID"))
        return("lipidmaps")
    if (any(x == "PUBCHEM_COMPOUND_CID"))
        return("pubchem")
    if (any(x == "HMDB_ID"))
        return("hmdb")
    if (any(x == "ID")) {
        message("Guessing we've got a SDF file from MoNa.")
        return("mona")
    }
    NULL
}


.hmdb_colmap <- c(id = "HMDB_ID",
                  name = "GENERIC_NAME",
                  inchi = "INCHI_IDENTIFIER",
                  inchikey = "INCHI_KEY",
                  formula = "FORMULA",
                  exactmass = "EXACT_MASS",
                  synonyms = "SYNONYMS",
                  smiles = "SMILES"
                  )
.hmdb_separator <- "; "
.chebi_colmap <- c(id = "ChEBI ID",
                   name = "ChEBI Name",
                   inchi = "InChI",
                   inchikey = "InChIKey",
                   formula = "Formulae",
                   exactmass = "Monoisotopic Mass",
                   synonyms = "Synonyms",
                   smiles = "SMILES"
                   )
.chebi_separator <- " __ "
.lipidmaps_colmap <- c(id = "LM_ID",
                       name = "NAME",
                       inchi = "INCHI",
                       inchikey = "INCHI_KEY",
                       formula = "FORMULA",
                       exactmass = "EXACT_MASS",
                       synonyms = "SYNONYMS",
                       smiles = NA
                       )
.lipidmaps_separator <- "; "
.pubchem_colmap <- c(id = "PUBCHEM_COMPOUND_CID",
                     name = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
                     inchi = "PUBCHEM_IUPAC_INCHI",
                     inchikey = "PUBCHEM_IUPAC_INCHIKEY",
                     formula = "PUBCHEM_MOLECULAR_FORMULA",
                     exactmass = "PUBCHEM_EXACT_MASS",
                     synonyms = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
                     smiles = "PUBCHEM_OPENEYE_CAN_SMILES"
                       # Others:
                                        # PUBCHEM_IUPAC_SYSTEMATIC_NAME,
                                        # PUBCHEM_IUPAC_CAS_NAME,
                                        # PUBCHEM_IUPAC_OPENEYE_NAME,
                                        # PUBCHEM_IUPAC_NAME
                     )
.pubchem_separator <- "; " # there seems to be none
.mona_colmap <- c(id = "ID",
                  name = "NAME",
                  inchi = "INCHIKEY",
                  inchikey = "INCHIKEY",
                  formula = "FORMULA",
                  exactmass = "EXACT MASS",
                  synonyms = "SYNONYMS",
                  smiles = NA
                  )
.mona_separator <- " __ "

#' @description Import compound information from a LipidBlast file in json
#'    format.
#'
#' @note This is a modified version from Jan's generate_lipidblast_tbl that
#'     extracts the mass also from the json and does not calculate it.
#'
#' @author Jan Stanstrup, Johannes Rainer and Prateek Arora
#'
#' @importFrom jsonlite read_json
#'
#' @importFrom dplyr bind_rows
#' @importFrom BiocParallel bplapply bpparam
#'
#' @md
#' @noRd
.import_lipidblast <- function(file, n = -1L, verbose = FALSE,
                               BPPARAM = bpparam()) {
    if (n < 0) {
        lipidb <- read_json(file)
        if (verbose)
            message("Processing ", length(lipidb), " elements ...",
                    appendLF = FALSE)

        ## Use BiocParallel to process elements in parallel
        res <- bplapply(lipidb, .parse_lipidblast_json_element,
                        BPPARAM = BPPARAM)

        if (verbose) message(" done.")
    } else {
        res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose,
                                             BPPARAM = BPPARAM)
    }
    bind_rows(res)
}

#' @description Parse a single LipidBlast JSON element.
#'
#' @param x A JSON element representing a compound.
#' @return A list containing parsed compound information.
#' @noRd
.parse_lipidblast_json_element <- function(x) {
    id <- x$id
    cmp <- x$compound[[1]]

    ## Helper function to extract metadata from multiple sources
    get_metadata <- function(name, sources) {
        for (source in sources) {
            value <- unlist(lapply(source, function(z) {
                if (z$name == name) z$value
            }))
            if (length(value) > 0 && !all(value == "")) {
                return(value[1])
            }
        }
        return(NA_character_)
    }

    ## Define metadata sources
    metadata_sources <- list(x$metaData, cmp$metaData)

    ## Get names
    nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
    snms <- if (length(nms) > 1L) nms[-1L] else NA_character_

    ## Extract metadata
    mass <- get_metadata("total exact mass", metadata_sources)
    mass <- as.numeric(mass)
    if (is.na(mass)) mass <- NA_real_

    frml <- get_metadata("molecular formula", metadata_sources)
    inchi <- get_metadata("InChI", metadata_sources)
    inchikey <- get_metadata("InChIKey", metadata_sources)
    smiles <- get_metadata("SMILES", metadata_sources)
    compound_class <- get_metadata("compound class", metadata_sources)
    ionization_mode <- get_metadata("ionization mode", metadata_sources)
    precursor_mz <- as.numeric(get_metadata("precursor m/z", metadata_sources))
    precursor_type <- get_metadata("precursor type", metadata_sources)
    retention_time <- as.numeric(get_metadata("retention time", metadata_sources))
    ccs <- as.numeric(get_metadata("ccs", metadata_sources))

    ## Extract spectrum data
    spectrum <- x$spectrum
    if (is.null(spectrum)) spectrum <- NA_character_

    list(
        compound_id = id,
        name = nms[1],
        inchi = inchi,
        inchikey = inchikey,
        smiles = smiles,
        formula = frml,
        exactmass = mass,
        compound_class = compound_class,
        ionization_mode = ionization_mode,
        precursor_mz = precursor_mz,
        precursor_type = precursor_type,
        retention_time = retention_time,
        ccs = ccs,
        spectrum = spectrum,
        synonyms = list(snms)
    )
}

#' @description Import compound information from a chunk of a LipidBlast file in JSON format.
#'
#' @param x The file path to the LipidBlast JSON file.
#' @param n The number of lines to read in each chunk.
#' @param verbose Logical, if TRUE, progress messages are printed.
#' @param BPPARAM A BiocParallelParam object specifying the parallel backend.
#' @return A list of parsed compound information.
#' @importFrom jsonlite fromJSON
#' @importFrom dplyr bind_rows
#' @importFrom BiocParallel bplapply bpparam
#' @noRd
.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE,
                                          BPPARAM = bpparam()) {
    con <- file(x, open = "r")
    on.exit(close(con))
    res <- list()
    while (length(ls <- readLines(con, n = n, warn = FALSE))) {
        if (length(grep("^\\[", ls[1L])))
            ls <- ls[-1L]
        if (length(grep("^\\]", ls[length(ls)])))
            ls <- ls[-length(ls)]
        ls <- sub(",$", "", ls)
        if (length(ls)) {
            chunk_res <- bplapply(ls, function(z) {
                .parse_lipidblast_json_element(
                    jsonlite::fromJSON(z, simplifyVector = FALSE))
            }, BPPARAM = BPPARAM)
            res <- c(res, chunk_res)
        }
        if (verbose)
            message("Processed ", length(ls), " elements")
    }
    res
}

#' @title Create a CompDb database
#'
#' @description
#'
#' `CompDb` databases can be created with the `createCompDb()` or the
#' `emptyCompDb()` functions, the former creating and initializing (filling)
#' the database with existing data, the latter creating an empty database that
#' can be subsequently filled with [insertCompound()] or [insertSpectra()]
#' calls.
#'
#' `emptyCompDb()` requires only the file name of the database that should be
#' created as input and returns a `CompDb` representing the empty database.
#'
#' `createCompDb()` creates a `SQLite`-based [`CompDb`] object/database
#' from a compound resource provided as a `data.frame` or `tbl`. Alternatively,
#' the name(s) of the file(s) from which the annotation should be extracted can
#' be provided. Supported are SDF files (such as those from the
#' *Human Metabolome Database* HMDB) that can be read using the
#' [compound_tbl_sdf()] or LipidBlast files (see [compound_tbl_lipidblast()].
#'
#' An additional `data.frame` providing metadata information including the data
#' source, date, version and organism is mandatory. By default, the function
#' will define the name of the database based on the provided metadata, but it
#' is also possible to define this manually with the `dbFile` parameter.
#'
#' Optionally MS/MS (MS2) spectra for compounds can be also stored in the
#' database. Currently only MS/MS spectra from HMDB are supported. These can
#' be downloaded in XML format from HMDB (http://www.hmdb.ca), loaded with
#' the [msms_spectra_hmdb()] or [msms_spectra_mona()] function and passed to
#' the function with the `msms_spectra` argument. See [msms_spectra_hmdb()] or
#' [msms_spectra_mona()] for information on the expected columns and format.
#'
#' Required columns for the `data.frame` providing the compound information (
#' parameter `x`) are:
#'
#' - `"compound_id"`: the ID of the compound. Can be an `integer` or
#'   `character`. Duplicated IDs are supported (for compatibility reasons), but
#'   not suggested. No missing values allowed.
#' - `"name"`: the compound's name.
#' - `"inchi"`: the InChI of the compound.
#' - `"inchikey"`: the InChI key.
#' - `"formula"`: the chemical formula.
#' - `"exactmass"`: the compound's (exact) mass.
#' - `"synonyms"`: additional synonyms/aliases for the compound. Should be
#'   either a single character or a list of values for each compound.
#'
#' Any additional columns in the provided `data.frame` (such as e.g. `"smiles"`
#' providing the compound's SMILES) are also supported and will be inserted into
#' the database table.
#'
#' See e.g. [compound_tbl_sdf()] or [compound_tbl_lipidblast()] for functions
#' creating such compound tables.
#'
#' The table containing the MS2 spectra data should have the following format
#' and columns:
#'
#' - `"spectrum_id"`: an arbitrary ID for the spectrum. Has to be an `integer`.
#' - `"compound_id"`: the ID of the compound to which the spectrum can be
#'   associated with. This has to be present in the `data.frame` defining the
#'   compounds.
#' - `"polarity"`: the polarity (as an `integer`, `0` for negative, `1` for
#'   positive, `NA` for not set).
#' - `"collision_energy"`: the collision energy.
#' - `"predicted"`: whether the spectrum was predicted or measured.
#' - `"splash"`: the SPLASH of the spectrum.
#' - `"instrument_type"`: the instrument type.
#' - `"instrument"`: the name of the instrument.
#' - `"precursor_mz"`: the precursor m/z (as a `numeric`).
#' - `"mz"`: the m/z values.
#' - `"intensity"`: the intensity values.
#'
#' Only for columns `"spectrum_id"`, `"compound_id"`, `"mz"` and `"intensity"`
#' a value has to be provided in each row of the `data.frame`. The others are
#' optional. Note that the `data.frame` can be either in the format as in the
#' example below (i.e. each row being one spectrum and columns `"mz"` and
#' `"intensity"` being of type `list` each element being the m/z or intensity
#' values of one spectrum) or in a *full* form, in which each row represents
#' one *peak* and all columns except `"mz"` and `"intensity"` containing
#' redundant information of each spectrum (hence columns `"mz"` and
#' `"intensity"` being of type `numeric`).
#'
#' The metadata `data.frame` is supposed to have two columns named `"name"` and
#' `"value"` providing the following minimal information as key-value pairs
#' (see `make_metadata` for a utility function to create such a `data.frame`):
#'
#' - `"source"`: the source from which the data was retrieved (e.g. `"HMDB"`).
#' - `"url"`: the url from which the original data was retrieved.
#' - `"source_version"`: the version from the original data source
#'   (e.g. `"v4"`).
#' - `"source_date"`: the date when the original data source was generated.
#' - `"organism"`: the organism. Should be in the form `"Hsapiens"` or
#'   `"Mmusculus"`.
#'
#' @details
#'
#' Metadata information is also used to create the file name for the database
#' file. The name starts with `"CompDb"`, followed by the organism, the
#' data source and its version. A compound database file for HMDB version 4
#' with human metabolites will thus be named: `"CompDb.Hsapiens.HMDB.v4"`.
#'
#' A single `CompDb` database is created from multiple SDF files (e.g. for
#' *PubChem*) if all the file names are provided with parameter `x`. Parallel
#' processing is currently not enabled because SQLite does not support it yet
#' natively.
#'
#' @param x For `createCompDb()`: `data.frame` or `tbl` with the compound
#'     annotations or `character` with the file name(s) from which the compound
#'     annotations should be retrieved. See description for details.
#'
#'     For `createCompDbPackage()`: `character(1)` with the file name of the
#'     `CompDb` SQLite file (created by `createCompDb`).
#'
#' @param metadata For `createCompDb()`: `data.frame` with metadata
#'     information. See description for details.
#'
#' @param msms_spectra For `createCompDb()`: `data.frame` with MS/MS spectrum
#'     data. See [msms_spectra_hmdb()] for the expected format and a function
#'     to import such data from spectrum xml files from HMDB.
#'
#' @param path `character(1)` with the path to the directory where the database
#'     file or package folder should be written. Defaults to the current
#'     directory.
#'
#' @param dbFile `character(1)` to optionally provide the name of the SQLite
#'     database file. If not provided (the default) the database name is defined
#'     using information from the provided `metadata`.
#'
#' @return For `createCompDb()`: a `character(1)` with the database name
#'     (invisibly).
#'
#' @importFrom DBI dbDriver dbWriteTable dbExecute dbDisconnect
#' @importFrom RSQLite dbConnect
#' @importFrom dplyr bind_cols bind_rows
#' @importFrom MsCoreUtils rbindFill
#'
#' @export
#'
#' @author Johannes Rainer
#'
#' @seealso
#'
#' [compound_tbl_sdf()] and [compound_tbl_lipidblast()] for functions
#' to extract compound annotations from files in SDF format, or files from
#' LipidBlast.
#'
#' [import_mona_sdf()] to import both the compound and spectrum data from a
#' SDF file from MoNa (Massbank of North America) in one call.
#'
#' [msms_spectra_hmdb()] and [msms_spectra_mona()] for functions to import
#' MS/MS spectrum data from xml files from HMDB or an SDF file from MoNa.
#'
#' [CompDb()] for how to use a compound database.
#'
#' @examples
#'
#' ## Read compounds for a HMDB subset
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#'
#' ## Create a metadata data.frame for the compounds.
#' metad <- data.frame(name = c("source", "url", "source_version",
#'     "source_date", "organism"), value = c("HMDB", "http://www.hmdb.ca",
#'     "v4", "2017-08-27", "Hsapiens"))
#'
#' ## Alternatively use the make_metadata helper function
#' metad <- make_metadata(source = "HMDB", source_version = "v4",
#'     source_date = "2017-08", organism = "Hsapiens",
#'     url = "http://www.hmdb.ca")
#' ## Create a SQLite database in the temporary folder
#' db_f <- createCompDb(cmps, metadata = metad, path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db <- CompDb(db_f)
#' db
#'
#' ## Create a database for HMDB that includes also MS/MS spectrum data
#' metad2 <- make_metadata(source = "HMDB_with_spectra", source_version = "v4",
#'     source_date = "2017-08", organism = "Hsapiens",
#'     url = "http://www.hmdb.ca")
#'
#' ## Import spectrum information from selected MS/MS xml files from HMDB
#' ## that are provided in the package
#' xml_path <- system.file("xml", package = "CompoundDb")
#' spctra <- msms_spectra_hmdb(xml_path)
#'
#' ## Create a SQLite database in the temporary folder
#' db_f2 <- createCompDb(cmps, metadata = metad2, msms_spectra = spctra,
#'     path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db2 <- CompDb(db_f2)
#' db2
#'
#' ## Does the database contain MS/MS spectrum data?
#' hasMsMsSpectra(db2)
#'
#' ## Create a database for a ChEBI subset providing the file name of the
#' ## corresponding SDF file
#' metad <- make_metadata(source = "ChEBI_sub", source_version = "2",
#'     source_date = NA, organism = "Hsapiens", url = "www.ebi.ac.uk/chebi")
#' db_f <- createCompDb(system.file("sdf/ChEBI_sub.sdf.gz",
#'     package = "CompoundDb"), metadata = metad, path = tempdir())
#' db <- CompDb(db_f)
#' db
#'
#' compounds(db)
#'
#' ## connect to the database and query it's tables using RSQlite
#' library(RSQLite)
#' con <- dbConnect(dbDriver("SQLite"), db_f)
#'
#' dbGetQuery(con, "select * from metadata")
#' dbGetQuery(con, "select * from ms_compound")
#'
#' ## To create a CompDb R-package we could simply use the
#' ## createCompDbPackage function on the SQLite database file name.
createCompDb <- function(x, metadata, msms_spectra, path = ".",
                         dbFile = character()) {
    .valid_metadata(metadata)
    if (!length(dbFile))
        db_file <- file.path(path, paste0(.db_file_from_metadata(metadata),
                                          ".sqlite"))
    else db_file <- file.path(path, dbFile)
    con <- dbConnect(dbDriver("SQLite"), dbname = db_file)
    ## Add additional metadata info
    metadata$name <- as.character(metadata$name)
    metadata$value <- as.character(metadata$value)
    metadata <- bind_rows(metadata, c(name = "db_creation_date",
                                      value = date()))
    metadata <- bind_rows(metadata, c(name = "supporting_package",
                                      value = "CompoundDb"))
    metadata <- bind_rows(metadata, c(name = "supporting_object",
                                      value = "CompDb"))
    dbWriteTable(con, name = "metadata", metadata, row.names = FALSE)
    ## ms_compound table
    if (is.character(x)) {
        tbls <- snms <- vector("list", length(x))
        for (i in seq_along(x)) {
            z <- x[i]
            message("Import data from ", basename(z), " ...", appendLF = FALSE)
            if (.is_sdf_filename(z)) {
                tbl <- compound_tbl_sdf(z)
            } else {
                ## Assume this is a json file... might simply be wrong.
                tbl <- compound_tbl_lipidblast(z)
            }
            message("OK")
            .valid_compound(tbl, db = FALSE)
            snms[[i]] <- bind_cols(compound_id = rep(tbl$compound_id,
                                                     lengths(tbl$synonyms)),
                                   synonym = unlist(tbl$synonyms))
            tbls[[i]] <- tbl[, colnames(tbl) != "synonyms"]
        }
        tbls <- do.call(rbindFill, tbls)
        snms <- do.call(rbindFill, snms)
        dbWriteTable(con, name = "synonym", snms, row.names = FALSE,
                     append = TRUE)
        dbWriteTable(con, name = "ms_compound", row.names = FALSE,
                     tbls, append = TRUE)

    } else {
        .valid_compound(x, db = FALSE)
        x_synonym <- bind_cols(compound_id = rep(x$compound_id,
                                                 lengths(x$synonyms)),
                               synonym = unlist(x$synonyms))
        dbWriteTable(con, name = "synonym", x_synonym, row.names = FALSE)
        dbWriteTable(con, name = "ms_compound", x[, colnames(x) != "synonyms"],
                     row.names = FALSE)
    }
    ## Creating indices
    dbExecute(con, "create index compound_id_idx on ms_compound (compound_id)")
    dbExecute(con, "create index compound_name_idx on ms_compound (name)")
    ## Process spectra.
    if (!missing(msms_spectra) && is.data.frame(msms_spectra)) {
        comp_ids <- unique(x$compound_id)
        if (!all(msms_spectra$compound_id %in% comp_ids))
            stop("All compound identifiers in 'msms_spectra' have to be ",
                 "present also in 'x'")
        msms_spectra <- msms_spectra[msms_spectra$compound_id %in% comp_ids, ]
        .insert_msms_spectra(con, msms_spectra)
    }
    dbDisconnect(con)
    invisible(db_file)
}

#' Add mz_min and mz_max columns to data.frame x
#'
#' @param x `data.frame` with MS MS spectrum data such as returned by
#'     [msms_spectra_hmdb()] **and** eventually collapsed with
#'     [.collapse_spectrum_df].
#'
#' @noRd
#'
#' @author Johannes Rainer
.add_mz_range_column <- function(x) {
    if (!any(colnames(x) == "mz"))
        stop("Required column 'mz' not found")
    if (is.list(x$mz)) {
        mzr <- do.call(rbind, lapply(x$mz, range))
        x$msms_mz_range_min <- mzr[, 1]
        x$msms_mz_range_max <- mzr[, 2]
        x
    } else {
        fct <- factor(x$spectrum_id, levels = unique(x$spectrum_id))
        mzs <- split(x$mz, f = fct)
        mzr <- do.call(rbind, lapply(mzs, range, na.rm = TRUE))
        x$msms_mz_range_min <- unsplit(mzr[, 1], fct)
        x$msms_mz_range_max <- unsplit(mzr[, 2], fct)
        x
    }
}

#' Function to insert MS/MS spectrum data into two database tables:
#' msms_spectrum_peak with the individual m/z and intensity values and
#' msms_spectrum_metadata with the spectrum annotations. Both tables are linked
#' via the spectrum_id.
#'
#' @param con database connection
#'
#' @param x `data.frame` with the spectrum data, such as returned by
#' [msms_spectra_hmdb]
#'
#' @noRd
#'
#' @author Johannes Rainer
.insert_msms_spectra <- function(con, x) {
    x <- .prepare_msms_spectra_table(x)
    msms_spectrum_peak <- x$msms_spectrum_peak
    x <- x$x
    dbWriteTable(con, name = "msms_spectrum_peak", msms_spectrum_peak,
                 row.names = FALSE)
    dbWriteTable(con, name = "msms_spectrum", x, row.names = FALSE)
    dbExecute(
        con, "create index msms_id_idx on msms_spectrum_peak (spectrum_id)")
    dbExecute(con, "create index msms_pid_idx on msms_spectrum_peak (peak_id)")
    dbExecute(con, "create index msms_mid_idx on msms_spectrum (spectrum_id)")
    dbExecute(con, "create index msms_cid_idx on msms_spectrum (compound_id)")
}

.prepare_msms_spectra_table <- function(x) {
    x <- .msms_spectrum_add_missing_columns(x)
    x$collision_energy <- as.character(x$collision_energy)
    .valid_msms_spectrum(x, blob = is.list(x$mz))
    x <- .add_mz_range_column(x)
    msms_spectrum_peak <- x[, c("spectrum_id", "mz", "intensity")]
    if (is.list(msms_spectrum_peak$mz))
        msms_spectrum_peak <- .expand_spectrum_df(msms_spectrum_peak)
    if (!is.numeric(msms_spectrum_peak$mz) ||
        !is.numeric(msms_spectrum_peak$intensity))
        stop("Columns 'mz' and 'intensity' should only contain numeric values")
    msms_spectrum_peak <- msms_spectrum_peak[
        order(msms_spectrum_peak$spectrum_id, msms_spectrum_peak$mz), ]
    msms_spectrum_peak$peak_id <- seq_len(nrow(msms_spectrum_peak))
    x <- unique(x[, !(colnames(x) %in% c("mz", "intensity"))])
    list(x = x, msms_spectrum_peak = msms_spectrum_peak)
}


.append_msms_spectra <- function(con, x) {
    ## Update spectrum_id
    nsp <- dbGetQuery(con, paste0("select max(spectrum_id)",
                                  " from msms_spectrum"))[1, 1]
    if("spectrum_id" %in% colnames(x))
        warning("'spectrum_id' variable in 'spectra' will be
                              replaced with internal indexes")
    x$spectrum_id <- nsp + seq_len(nrow(x))
    x <- .prepare_msms_spectra_table(x)
    msms_spectrum_peak <- x$msms_spectrum_peak
    x <- x$x
    ## Create eventual additional columns in database.
    cols <- colnames(dbGetQuery(con, "select * from msms_spectrum limit 1"))
    new_cols <- colnames(x)[!colnames(x) %in% cols]
    dtype <- dbDataType(con, x[, new_cols, drop = FALSE])
    dtype <- paste(names(dtype), dtype)
    for (dt in dtype)
        dbExecute(con, paste("alter table msms_spectrum add", dt))
    dbAppendTable(con, "msms_spectrum", x)

    ## Update peak_id
    nsp <- dbGetQuery(con, paste0("select max(peak_id)",
                                  " from msms_spectrum_peak"))[1, 1]
    msms_spectrum_peak$peak_id <- nsp + seq_len(nrow(msms_spectrum_peak))
    dbAppendTable(
        con, "msms_spectrum_peak",
        msms_spectrum_peak[, c("spectrum_id", "mz", "intensity", "peak_id")])
}

.msms_spectrum_add_missing_columns <- function(x) {
    cols <- names(.required_msms_spectrum_columns)
    cols <- cols[!cols %in% c("spectrum_id", "compound_id", "mz", "intensity")]
    cols <- cols[!cols %in% colnames(x)]
    n <- nrow(x)
    if (length(cols)) {
        x_new <- as.data.frame(
            lapply(.required_msms_spectrum_columns[cols],
                   as, object = rep(NA, n)))
        x <- cbind(x, x_new)
    }
    x
}

.required_metadata_keys <- c("source", "url", "source_version", "source_date")
.required_compound_db_columns <- c("compound_id", "name", "inchi",
                                   "inchikey", "formula", "exactmass")
.required_compound_columns <- c(.required_compound_db_columns, "synonyms")
.required_ion_columns <- c("compound_id", "ion_adduct", "ion_mz", "ion_rt")



.required_msms_spectrum_columns <- c(spectrum_id = "integer",
                                     compound_id = "character",
                                     polarity = "integer",
                                     collision_energy = "character",
                                     predicted = "logical",
                                     splash = "character",
                                     instrument_type = "character",
                                     instrument = "character",
                                     precursor_mz = "numeric",
                                     mz = "numeric",
                                     intensity = "numeric"
                                     )

.required_msms_spectrum_columns_blob <- c(spectrum_id = "integer",
                                          compound_id = "character",
                                          polarity = "integer",
                                          collision_energy = "character",
                                          predicted = "logical",
                                          splash = "character",
                                          instrument_type = "character",
                                          instrument = "character",
                                          precursor_mz = "numeric",
                                          mz = "list",
                                          intensity = "list"
                                          )

.valid_msms_spectrum <- function(x, blob = FALSE, error = TRUE) {
    coltypes <- unlist(lapply(x, class))
    msg <- NULL
    if (blob)
        req_cols <- .required_msms_spectrum_columns_blob
    else req_cols <- .required_msms_spectrum_columns
    not_found <- which(!(names(req_cols) %in% names(coltypes)))
    if (length(not_found))
        msg <- c(msg, paste0("Required column(s) ",
                             paste0("'", names(req_cols)[not_found], "'"),
                             " not found in 'msms_spectra'"))
    common_cols <- union(names(coltypes), names(req_cols))
    wrong_type <- which(coltypes[common_cols] != req_cols[common_cols])
    if (length(wrong_type))
        msg <- c(msg,
                 paste0("One or more columns contain data of the wrong type: ",
                        paste(names(coltypes[names(wrong_type)]),
                              coltypes[names(wrong_type)], sep = ":",
                              collapse = ", "),
                        ". Please refer to the documentation of createCompDb ",
                        "for the correct data types."))
    if (length(msg))
        if (error) stop(msg)
        else msg
    else TRUE
}

#' @description Create the database file name from the metadata `data.frame`.
#'     The function checks also for the presence of all required metadata fields
#'     ensuring that these are also not `NA` or `NULL`.
#'
#' @noRd
.db_file_from_metadata <- function(x) {
    vls <- c(x$value[x$name == "organism"], x$value[x$name == "source"],
             x$value[x$name == "source_version"])
    vls <- vls[!is.na(vls)]
    paste0(c("CompDb", vls), collapse = ".")
}

#' @description Check the metadata `data.frame` for required columns.
#'
#' @noRd
.valid_metadata <- function(metadata, error = TRUE) {
    txt <- .valid_data_frame_columns(metadata, "metadata", c("name", "value"))
    if (!length(txt)) {
        keys <- metadata$name
        vals <- metadata$value
        got_it <- .required_metadata_keys %in% keys
        if (!all(got_it))
            txt <- c(txt, paste0("required fields ",
                                 paste0(.required_metadata_keys[!got_it],
                                        collapse = ", "),
                                 " not found in metadata data.frame"))
    }
    .throw_error(txt, error = error)
}

.throw_error <- function(txt = character(), error = TRUE) {
    if (length(txt)) {
        if (error) {
            txt <- paste(txt, collapse = "\n")
            stop(txt)
        } else txt
    } else
        TRUE
}

.valid_data_frame_columns <- function(x, varName = "x",
                                      req_cols = .required_compound_columns) {
    txt <- character()
    if (!is.data.frame(x))
        return(paste0("'", varName, "' is supposed to be a data.frame"))
    got_it <- req_cols %in% colnames(x)
    if (!all(got_it))
        txt <- c(txt, paste0("Miss required columns: ",
                             paste0(req_cols[!got_it], collapse = ", ")))
    txt
}

#' @description Check that the compounds table contains all required data.
#'
#' @param db `logical(1)` whether validity should be checked on the internal
#'     database table instead of the input file.
#' @md
#'
#' @noRd
.valid_compound <- function(x, error = TRUE, db = TRUE) {
    if (db)
        txt <- .valid_data_frame_columns(x, "x", .required_compound_db_columns)
    else
        txt <- .valid_data_frame_columns(x, "x", .required_compound_columns)
    if (!length(txt) && !is.numeric(x$exactmass))
        txt <- c(txt, "Column 'exactmass' should be numeric")
    if (!length(txt) && (any(is.na(x$compound_id)) | any(x$compound_id == "")))
        txt <- c(txt, "Missing values are not allowed in column 'compound_id'")
    .throw_error(txt, error = error)
}

#' @description
#'
#' `createCompDbPackage` creates an R data package with the data from a
#' [`CompDb`] object.
#'
#' @importFrom Biobase createPackage
#'
#' @param version For `createCompDbPackage()`: `character(1)` with the version
#'     of the package (ideally in the format `"x.y.z"`).
#'
#' @param maintainer For `createCompDbPackage()`: `character(1)` with the
#'     name and email address of the package maintainer (in the form
#'     `"First Last <first.last@provider.com>"`.
#'
#' @param author For `createCompDbPackage()`: `character(1)` with the name
#'     of the package author.
#'
#' @param license For `createCompDbPackage()`: `character(1)` with the
#'     license of the package respectively the originating provider.
#'
#' @export
#'
#' @rdname createCompDb
createCompDbPackage <- function(x, version, maintainer, author,
                                    path = ".", license = "Artistic-2.0") {
    if (missing(x) | missing(version) | missing(maintainer) | missing(author))
        stop("'x', 'version', 'maintainer' and 'author' are required")
    if (!is.character(x))
        stop("'x' is supposed to be the file name of the CompDb")
    cdb <- CompDb(x)
    metad <- .metadata(cdb)
    pkg_name <- .db_file_from_metadata(metad)
    m_source <- .metadata_value(cdb, "source")
    m_source_version <- .metadata_value(cdb, "source_version")
    m_source_date <- .metadata_value(cdb, "source_date")
    m_organism <- .metadata_value(cdb, "organism")
    m_source_url <- .metadata_value(cdb, "url")
    template_path <- system.file("pkg-template", package = "CompoundDb")
    symvals <- list(
        PKGTITLE = paste0(m_source, " compound annotation package"),
        PKGDESCRIPTION = paste0("Exposes a CompDb compound annotation ",
                                "databases with annotations retrieved from ",
                                m_source, "."),
        PKGVERSION  = version,
        AUTHOR = author,
        MAINTAINER = maintainer,
        LIC = license,
        ORGANISM = m_organism,
        SPECIES = m_organism,
        SOURCE = m_source,
        SOURCEVERSION = as.character(m_source_version),
        RELEASEDATE = m_source_date,
        SOURCEURL =  m_source_url,
        DBOBJNAME = pkg_name
    )
    createPackage(pkgname = pkg_name, destinationDir = path,
                  originDir = template_path, symbolValues = symvals)
    sqlite_path <- file.path(path, pkg_name, "inst", "extdata")
    dir.create(sqlite_path, showWarnings = FALSE, recursive = TRUE)
    file.copy(x, to = file.path(sqlite_path, paste0(pkg_name, ".sqlite")))
    invisible(file.path(path, pkg_name))
}

#' @description Simple test function to evaluate whether a file is an SDF file
#'     based on its file name.
#'
#' @param x `character(1)` the file name to be tested.
#'
#' @noRd
#'
#' @md
#'
#' @author Johannes Rainer
.is_sdf_filename <- function(x) {
    grepl(".sdf($|.gz$)", x, ignore.case = TRUE)
}

#' @description `make_metadata()` helps generating a metadata `data.frame` in
#'     the correct format expected by the `createCompDb` function. The function
#'     returns a `data.frame`.
#'
#' @param source For `make_metadata()`: `character(1)` with the name of the
#'     resource that provided the compound annotation.
#'
#' @param url For `make_metadata()`: `character(1)` with the url to the original
#'     resource.
#'
#' @param source_version For `make_metadata()`: `character(1)` with the version
#'     of the original resource providing the annotation.
#'
#' @param source_date For `make_metadata()`: `character(1)` with the date of
#'     the resource's release.
#'
#' @param organism For `make_metadata()`: `character(1)` with the name of the
#'     organism. This should be in the format `"Hsapiens"` for human,
#'     `"Mmusculus"` for mouse etc. Leave to `NA` if not applicable.
#'
#' @export
#'
#' @rdname createCompDb
make_metadata <- function(source = character(), url = character(),
                          source_version = character(),
                          source_date = character(),
                          organism = NA_character_) {
    if (!length(source) || !length(url) || !length(source_version) ||
        !length(source_date) || !length(organism))
        stop("Arguments 'source', 'url', 'source_version', 'source_date' ",
             "and 'organism' are required")
    data.frame(name = c("source", "url", "source_version", "source_date",
                        "organism"),
               value = as.character(c(source, url, source_version,
                                      source_date, organism)),
               stringsAsFactors = FALSE)
}

#' @title Import compound and spectrum information from MoNa
#'
#' @description
#'
#' `import_mona_sdf()` allows to import compound and spectrum information from
#' an SDF file from MoNa (Massbank of North America
#' http://mona.fiehnlab.ucdavis.edu/). This function is a convenience function
#' using the [compound_tbl_sdf()] and [msms_spectra_mona()] functions for data
#' import but avoiding to read the SDF files twice.
#'
#' @note
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compound
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' @param x `character(1)` being the SDF file name.
#'
#' @param nonStop `logical(1)` whether file content specific errors should
#'     only reported as warnings and not break the full import process. The
#'     value of this parameter is passed to parameter `skipErrors` of the
#'     [read.SDFset()] function.
#'
#' @author Johannes Rainer
#'
#' @return A `list` with elements `"compound"` and `"msms_spectrum"` containing
#'     data.frames with compound and MS/MS spectrum data, respectively.
#'
#' @export
#'
#' @seealso
#'
#' [compound_tbl_sdf()] to read only the compound information.
#'
#' [msms_spectra_mona()] to read only the spectrum data.
#'
#' @examples
#'
#' ## Define the test file containing a small subset from MoNa
#' fl <- system.file("sdf/MoNa_export-All_Spectra_sub.sdf.gz",
#'     package = "CompoundDb")
#'
#' ## Import the data
#' res <- import_mona_sdf(fl)
import_mona_sdf <- function(x, nonStop = TRUE) {
    .import_mona_sdf(x, nonStop, TRUE)
}

.import_mona_sdf <- function(x, nonStop = TRUE, compounds = TRUE) {
    message("Reading SDF file ... ", appendLF = FALSE)
    sdfs <- datablock2ma(datablock(read.SDFset(x, skipErrors = nonStop)))
    if (!any(colnames(sdfs) == "MASS SPECTRAL PEAKS"))
        stop("The provided file does not contain \"MASS SPECTRAL PEAKS\" ",
             "elements. Is this an SDF file from MoNa?")
    message("OK")
    if (compounds) {
        message("Extracting compound information ... ", appendLF = FALSE)
        suppressMessages(cmps <- .simple_extract_compounds_sdf(sdfs))
        message("OK")
    } else cmps <- data.frame()
    message("Extracting spectrum information ... ", appendLF = FALSE)
    spctra <- .extract_spectra_mona_sdf(sdfs)
    message("OK")
    list(compound = cmps, msms_spectrum = spctra)
}

#' @importFrom RSQLite SQLITE_RW
#'
#' @export
#'
#' @rdname createCompDb
emptyCompDb <- function(dbFile = character()) {
    if (file.exists(dbFile))
        stop("The file \"", dbFile, "\" does already exist.")
    ## Initialize tables.
    comps <- data.frame(compound_id = character(), name = character(),
                        inchi = character(), inchikey = character(),
                        formula = character(), exactmass = numeric(),
                        synonyms = character())
    metad <- data.frame(name = c("source", "url", "source_version",
                                 "source_date", "organism"),
                        value = NA_character_)
    ## source, url, source_version, source_date, organism.
    createCompDb(comps, metad, path = dirname(dbFile),
                 dbFile = basename(dbFile))
    CompDb(dbFile, flags = SQLITE_RW)
}
EuracBiomedicalResearch/CompoundDb documentation built on Oct. 1, 2024, 2:16 p.m.