massdatabase: Play with databases about mass spectrometry

Documented in request_foodb_compound_info_crawler

#' Scrape Compound Information from FooDB
#'
#' This function scrapes compound information from the FooDB website. It visits multiple pages of the compounds section, extracts the data from the HTML tables on each page, and combines the results into a single data frame.
#'
#' @param url A character string specifying the base URL of the FooDB compounds page. Default is `"https://foodb.ca/compounds"`.
#' @param sleep A numeric value indicating the number of seconds to pause between requests to avoid overwhelming the server. Default is `1` second.
#' @param pages A numeric vector indicating which pages to scrape. Default is `1:2838`, which covers all pages on the FooDB compounds section.
#'
#' @return A data frame containing the combined table data from all the specified pages. Each row corresponds to one compound entry from the scraped pages.
#'
#' @details
#' The function uses the `purrr::map` function to iterate over the pages and scrape data from each page. The data on each page is extracted using `rvest::html_table` and combined into a single data frame.
#'
#' @examples
#' \dontrun{
#' # Scrape the first 3 pages with a 2-second delay between requests:
#' data <- request_foodb_compound_info_crawler(
#'   pages = 1:3,
#'   sleep = 2
#' )
#' head(data)
#' }
#'
#' @importFrom purrr map
#' @importFrom rvest read_html html_table
#' @importFrom dplyr %>%
#' @export

request_foodb_compound_info_crawler <-
  function(url = "https://foodb.ca/compounds",
           sleep = 1,
           pages = c(1:2838)) {
    result <-
      purrr::map(pages, function(idx) {
        cat(idx, " ")
        Sys.sleep(sleep)
        new_url <-
          paste0(url, "?page=", idx)
        x <-
          rvest::read_html(x = new_url)

        x <-
          x %>%
          rvest::html_table(fill = TRUE) %>%
          `[[`(1)
        x
      }) %>%
      do.call(rbind, .) %>%
      as.data.frame()
    invisible(result)
  }



#' @title Request one specific the compound information in FoodB
#' @description Request one specific the compound information in FoodB
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param url Default is "https://foodb.ca/compounds".
#' @param compound_id compound id. For example, FDB000004.
#' @param return_form data.frame or list.
#' @return A data frame or list.
#' @importFrom XML xmlTreeParse xmlToList
#' @importFrom magrittr %>%
#' @export
#' @examples
#' x = request_foodb_compound( compound_id = "FDB000004", return_form = "list")
#' x[1:2]
#' y = request_foodb_compound(compound_id = "FDB000004", return_form = "data.frame")
#' head(y)

request_foodb_compound <-
  function(url = "https://foodb.ca/compounds",
           compound_id = "FDB000004",
           return_form = c("list" , "data.frame")) {
    return_form <- match.arg(return_form)
    result <-
      readLines(paste0(url, "/", compound_id, ".xml"), warn = FALSE)
    result <-
      XML::xmlTreeParse(file = result, asText = TRUE)
    result <-
      XML::xmlToList(result)

    # names(result)

    if (return_form == "list") {
      result <- result
    } else{
      version <- result$version
      foods <- paste(unlist(result$foods[1, ]), collapse = "{}")
      result <-
        data.frame(
          version = version,
          creation_date = result$creation_date,
          update_date = result$update_date,
          accession = result$accession,
          name = result$name,
          description = result$description,
          synonyms = paste(unlist(result$synonyms), collapse = "{}"),
          chemical_formula = result$chemical_formula,
          average_molecular_weight = as.numeric(result$average_molecular_weight),
          monisotopic_moleculate_weight = as.numeric(result$monisotopic_moleculate_weight),
          iupac_name = ifelse(is.null(result$iupac_name), NA, result$iupac_name),
          traditional_iupac = ifelse(
            is.null(result$traditional_iupac),
            NA,
            result$traditional_iupac
          ),
          cas_registry_number = ifelse(
            is.null(result$cas_registry_number),
            NA,
            result$cas_registry_number
          ),
          smiles = ifelse(is.null(result$smiles), NA, result$smiles),
          inchi = ifelse(is.null(result$inchi), NA, result$inchi),
          inchikey = ifelse(is.null(result$inchikey), NA, result$inchikey),
          state = ifelse(is.null(result$state), NA, result$state),
          pathways = ifelse(is.null(result$pathways), NA, result$pathways),
          hmdb_id = ifelse(is.null(result$hmdb_id), NA, result$hmdb_id),
          pubchem_compound_id = ifelse(
            is.null(result$pubchem_compound_id),
            NA,
            result$pubchem_compound_id
          ),
          chemspider_id = ifelse(is.null(result$chemspider_id),
                                 NA,
                                 result$chemspider_id),
          kegg_id = ifelse(is.null(result$kegg_id), NA, result$kegg_id),
          chebi_id = ifelse(is.null(result$chebi_id), NA, result$chebi_id),
          biocyc_id = ifelse(is.null(result$biocyc_id), NA, result$biocyc_id),
          het_id = ifelse(is.null(result$het_id), NA, result$het_id),
          wikipidia = ifelse(is.null(result$wikipidia), NA, result$wikipidia),
          vmh_id = ifelse(is.null(result$vmh_id), NA, result$vmh_id),
          fbonto_id = ifelse(is.null(result$fbonto_id), NA, result$fbonto_id),
          foodb_id = ifelse(is.null(result$foodb_id), NA, result$foodb_id),
          general_references = ifelse(
            is.null(result$general_references),
            NA,
            result$general_references
          ),
          foods = foods,
          flavors = ifelse(is.null(result$flavors), NA, result$flavors),
          enzymes = ifelse(is.null(result$enzymes), NA, result$enzymes),
          health_effects = ifelse(
            is.null(result$health_effects),
            NA,
            result$health_effects
          )
        )
    }
    invisible(result)
  }

#' @title Request all the food information in FoodB
#' @description Request all the food information in FoodB
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param url Default is "https://foodb.ca/downloads".
#' @param sleep sleep default is 1 second.
#' @param pages default is from 1:2838
#' @return A data frame.
#' @importFrom rvest read_html html_table
#' @importFrom magrittr %>%
#' @importFrom utils download.file untar
#' @export
#' @examples
#' x = request_foodb_compound_info()
#' head(x)

request_foodb_compound_info <-
  function(url = c(
    "https://raw.githubusercontent.com/jaspershen/databases/main/data/foodb_compound_info.rda",
    "https://foodb.ca/downloads",
    "https://foodb.ca/compounds"
  ),
  sleep = 1,
  pages = 1:2838) {
    url <- match.arg(url)
    if (url == "https://raw.githubusercontent.com/jaspershen/databases/main/data/foodb_compound_info.rda") {
      temp_file <- tempfile()
      dir.create(temp_file, showWarnings = FALSE)
      options(timeout = 10000)
      utils::download.file(url = url,
                           destfile = file.path(temp_file, "foodb_compound_info.rda"))

      load(file.path(temp_file, "foodb_compound_info.rda"))

      unlink(temp_file)
      x <-
        food_compound_info %>%
        dplyr::select(public_id, name)
      return(x)
    }

    if (url ==  "https://foodb.ca/downloads") {
      temp_file <- tempfile()
      dir.create(temp_file, showWarnings = FALSE)
      options(timeout = 10000)
      utils::download.file(url = "https://foodb.ca/public/system/downloads/foodb_2020_4_7_csv.tar.gz",
                           destfile = file.path(temp_file, "foodb_2020_4_7_csv.tar"))

      utils::untar(tarfile = file.path(temp_file, "foodb_2020_4_7_csv.tar"),
                   exdir = temp_file)

      food_compound_info <-
        readr::read_csv(file.path(temp_file,
                                  "foodb_2020_04_07_csv/Compound.csv"),
                        show_col_types = FALSE)

      unlink(temp_file)
      x <-
        food_compound_info %>%
        dplyr::select(public_id, name)
      return(x)
    }

    if (url == "https://foodb.ca/compounds") {
      request_foodb_compound_info_crawler(sleep = sleep,
                                          pages = pages)
    }

  }



#' @title Request MS2 spectra of one compound in FoodB
#' @description Request MS2 spectra of one compound in FoodB
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param compound_id compound id. For example, FDB000004.
#' @return A data frame
#' @importFrom XML xmlTreeParse xmlToList
#' @importFrom magrittr %>%
#' @importFrom rvest read_html html_element html_attr
#' @export
#' @examples
#' x = request_foodb_compound_ms2(compound_id = "FDB000004")

request_foodb_compound_ms2 <-
  function(compound_id = "FDB000013") {
    url <- paste0("https://foodb.ca/compounds/", compound_id)

    result <-
      readLines(paste0(url, ".xml"), warn = FALSE)

    result <-
      XML::xmlTreeParse(file = result, asText = TRUE)

    result <-
      XML::xmlToList(result)

    spectra <-
      result$spectra %>%
      as.data.frame()

    idx <-
      which(unname(unlist(spectra[1, ])) == "Specdb::MsMs")

    if (length(idx) == 0) {
      message('No MS/MS.')
      return(NA)
    }

    ms2_id <-
      unname(unlist(spectra[2, idx, drop = TRUE]))

    ms2_url <- "https://foodb.ca/spectra/ms_ms/"

    ms2_spectra <-
      lapply(ms2_id, function(temp_id) {
        # cat(temp_id, " ")
        html_document <-
          rvest::read_html(paste0(ms2_url, temp_id))
        link <-
          html_document %>%
          rvest::html_element("tr:nth-child(1) a") %>%
          rvest::html_attr("href")
        if (is.na(link)) {
          return(NULL)
        }
        ms2 <-
          tryCatch(read.table(link, header = FALSE), error = function(e){NULL})
        if (is.null(ms2)) {
          return(NULL)
        }
        colnames(ms2) <-
          c("mz", "intensity")

        ms1_info <-
          html_document %>%
          rvest::html_table()

        ms1_info <-
          rbind(ms1_info[[1]],
                ms1_info[[2]])

        colnames(ms1_info) <-
          c("name", "value")

        list(ms1_info = ms1_info,
             ms2 = ms2)

      })

    names(ms2_spectra) <- ms2_id
    ms2_spectra

  }



#' #' @title Request one specific the food information in FoodB
#' #' @description Request one specific the food information in FoodB
#' #' @author Xiaotao Shen
#' #' \email{shenxt1990@@outlook.com}
#' #' @param url Default is "https://foodb.ca/foods".
#' #' @param food_id food id. For example, FOOD00971
#' #' @param return_form data.frame or list.
#' #' @return A data frame or list.
#' #' @importFrom XML xmlTreeParse xmlToList
#' #' @importFrom magrittr %>%
#' #' @export
#' #' @examples
#' #' x = request_foodb_food( food_id = "FOOD00971", return_form = "list")
#' #' x
#' #' y = request_foodb_food( food_id = "FOOD00971", return_form = "data.frame")
#' #'
#'
#' request_foodb_food <-
#'   function(url = "https://foodb.ca/foods",
#'            food_id = "FOOD00971",
#'            return_form = c("list" ,"data.frame")) {
#'     return_form <- match.arg(return_form)
#'     result <-
#'       readLines(paste0(url, "/", food_id, ".xml"))
#'     result <-
#'       XML::xmlTreeParse(file = result, asText = TRUE)
#'     result <-
#'       XML::xmlToList(result)
#'
#'     # names(result)
#'
#'     if (return_form == "list") {
#'       result <- result
#'     } else{
#'       version <- result$version
#'       foods <- paste(unlist(result$foods[1, ]), collapse = "{}")
#'       result <-
#'         data.frame(
#'           version = version,
#'           creation_date = result$creation_date,
#'           update_date = result$update_date,
#'           accession = result$accession,
#'           name = result$name,
#'           description = result$description,
#'           synonyms = paste(unlist(result$synonyms), collapse = "{}"),
#'           chemical_formula = result$chemical_formula,
#'           average_molecular_weight = as.numeric(result$average_molecular_weight),
#'           monisotopic_moleculate_weight = as.numeric(result$monisotopic_moleculate_weight),
#'           iupac_name = ifelse(is.null(result$iupac_name), NA, result$iupac_name),
#'           traditional_iupac = ifelse(
#'             is.null(result$traditional_iupac),
#'             NA,
#'             result$traditional_iupac
#'           ),
#'           cas_registry_number = ifelse(
#'             is.null(result$cas_registry_number),
#'             NA,
#'             result$cas_registry_number
#'           ),
#'           smiles = ifelse(is.null(result$smiles), NA, result$smiles),
#'           inchi = ifelse(is.null(result$inchi), NA, result$inchi),
#'           inchikey = ifelse(is.null(result$inchikey), NA, result$inchikey),
#'           state = ifelse(is.null(result$state), NA, is.null(result$state)),
#'           pathways = ifelse(is.null(result$pathways), NA, is.null(result$pathways)),
#'           hmdb_id = ifelse(is.null(result$hmdb_id), NA, is.null(result$hmdb_id)),
#'           pubchem_food_id = ifelse(
#'             is.null(result$pubchem_food_id),
#'             NA,
#'             is.null(result$pubchem_food_id)
#'           ),
#'           chemspider_id = ifelse(
#'             is.null(result$chemspider_id),
#'             NA,
#'             is.null(result$chemspider_id)
#'           ),
#'           kegg_id = ifelse(is.null(result$kegg_id), NA, is.null(result$kegg_id)),
#'           chebi_id = ifelse(is.null(result$chebi_id), NA, is.null(result$chebi_id)),
#'           biocyc_id = ifelse(is.null(result$biocyc_id), NA, is.null(result$biocyc_id)),
#'           het_id = ifelse(is.null(result$het_id), NA, is.null(result$het_id)),
#'           wikipidia = ifelse(is.null(result$wikipidia), NA, is.null(result$wikipidia)),
#'           vmh_id = ifelse(is.null(result$vmh_id), NA, is.null(result$vmh_id)),
#'           fbonto_id = ifelse(is.null(result$fbonto_id), NA, is.null(result$fbonto_id)),
#'           foodb_id = ifelse(is.null(result$foodb_id), NA, is.null(result$foodb_id)),
#'           general_references = ifelse(
#'             is.null(result$general_references),
#'             NA,
#'             is.null(result$general_references)
#'           ),
#'           foods = foods,
#'           flavors = ifelse(is.null(result$flavors), NA, is.null(result$flavors)),
#'           enzymes = ifelse(is.null(result$enzymes), NA, is.null(result$enzymes)),
#'           health_effects = ifelse(
#'             is.null(result$health_effects),
#'             NA,
#'             is.null(result$health_effects)
#'           )
#'         )
#'     }
#'     invisible(result)
#'   }








#' @title Download FOODB compound data
#' @description Download FOODB compound data
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param compound_id "all": download all compounds, or a vector of compound IDs.
#' @param path Default is .
#' @return FOODB compound database, rda format.
#' @export

download_foodb_compound <-
  function(compound_id = "all",
           path = ".") {
    dir.create(path, recursive = TRUE, showWarnings = FALSE)

    foodb_id <-
      request_foodb_compound_info(url =
                                    "https://raw.githubusercontent.com/jaspershen/databases/main/data/foodb_compound_info.rda")

    if (all(compound_id != "all")) {
      foodb_id <-
        foodb_id %>%
        dplyr::filter(public_id %in% compound_id)
    }

    foodb_id <-
      foodb_id$public_id

    pb <- progress::progress_bar$new(total = length(foodb_id))

    foodb_compound_database <-
      seq_along(foodb_id) %>%
      purrr::map(function(i) {
        pb$tick()
        result <-
          tryCatch(
            request_foodb_compound(compound_id = foodb_id[i]),
            error = function(e)
              NULL
          )
        if (is.null(result)) {
          return(NULL)
        } else{
          return(result)
        }
      })

    save(foodb_compound_database,
         file = file.path(path, "foodb_compound_database"))
  }






#' @title Read the FOODB compound database from download_foodb_compound function
#' @description Read the FOODB compound database from download_foodb_compound function
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param path Default is .. Should be same with download_foodb_compound function.
#' @return A data frame
#' @importFrom magrittr %>%
#' @importFrom plyr dlply .
#' @importFrom readr read_delim
#' @importFrom dplyr mutate bind_rows select distinct rename full_join filter
#' @importFrom tidyr pivot_wider
#' @importFrom purrr map
#' @export
read_foodb_compound <-
  function(path = ".") {
    load(file.path(path, "foodb_compound_database"))

    pb <-
      progress::progress_bar$new(total = length(foodb_compound_database))

    foodb_result <-
      seq_along(foodb_compound_database) %>%
      purrr::map(function(i) {
        # cat(x$accession, " ")
        pb$tick()
        x <- foodb_compound_database[[i]]
        if (is.null(x)) {
          return(NULL)
        }
        Kingdom <-
          ifelse(is.null(x$taxonomy$kingdom), NA, x$taxonomy$kingdom)
        Super_class <-
          ifelse(is.null(x$taxonomy$super_class),
                 NA,
                 x$taxonomy$super_class)
        Class <-
          ifelse(is.null(x$taxonomy$class), NA, x$taxonomy$class)
        Sub_class <-
          ifelse(is.null(x$taxonomy$sub_class), NA, x$taxonomy$sub_class)
        State <- ifelse(is.null(x$state), NA, x$state)
        HMDB.ID <- ifelse(is.null(x$hmdb_id), NA, x$hmdb_id)
        PUBCHEM.ID <-
          ifelse(is.null(x$pubchem_compound_id),
                 NA,
                 x$pubchem_compound_id)
        CHEMSPIDER.ID <-
          ifelse(is.null(x$chemspider_id), NA, x$chemspider_id)
        KEGG.ID <- ifelse(is.null(x$kegg_id), NA, x$kegg_id)
        CHEBI.ID <- ifelse(is.null(x$chebi_id), NA, x$chebi_id)
        BIOCYC.ID <- ifelse(is.null(x$biocyc_id), NA, x$biocyc_id)
        HET.ID <- ifelse(is.null(x$het_id), NA, x$het_id)
        WIKIPEDIA.ID <-
          ifelse(is.null(x$wikipidia), NA, x$wikipidia)
        VMH.ID <- ifelse(is.null(x$vmh_id), NA, x$vmh_id)
        Synonyms = ifelse(is.null(x$synonyms), NA, paste(unname(unlist(x$synonyms)), collapse = "{}"))
        foods <- x$foods
        if (is.null(foods)) {
          Food_name <- NA
          Food_type <- NA
          Food_category <- NA
          Food_scientific_name <- NA
        } else{
          if (class(foods)[1] == "list") {
            foods <-
              foods %>%
              lapply(function(y) {
                data.frame(
                  name = ifelse(is.null(y$name), NA, y$name),
                  food_type = ifelse(is.null(y$food_type), NA, y$food_type),
                  category = ifelse(is.null(y$category), NA, y$category),
                  name_scientific = ifelse(is.null(y$name_scientific), NA, y$name_scientific)
                )
              }) %>%
              dplyr::bind_rows() %>%
              as.data.frame()

            Food_name <- as.character(foods[, "name"]) %>%
              paste(collapse = "{}")
            Food_type <- as.character(foods[, "food_type"]) %>%
              paste(collapse = "{}")
            Food_category <- as.character(foods[, "category"]) %>%
              paste(collapse = "{}")
            Food_scientific_name <-
              as.character(foods[, "name_scientific"]) %>%
              paste(collapse = "{}")

          } else{
            colnames(foods) <- paste("V", 1:ncol(foods), sep = "")

            foods <-
              foods[1:4, , drop = FALSE] %>%
              as.data.frame() %>%
              apply(2, function(y) {
                y <-
                  y %>%
                  lapply(function(z) {
                    if (is.null(z)) {
                      return(NA)
                    } else{
                      return(z)
                    }
                  }) %>%
                  unlist()
                y
              })
            Food_name <- as.character(foods["name",]) %>%
              paste(collapse = "{}")
            Food_type <- as.character(foods["food_type",]) %>%
              paste(collapse = "{}")
            Food_category <- as.character(foods["category",]) %>%
              paste(collapse = "{}")
            Food_scientific_name <-
              as.character(foods["name_scientific",]) %>%
              paste(collapse = "{}")
          }
        }

        data.frame(
          Lab.ID = x$accession,
          Create_date = ifelse(is.null(x$creation_date), NA, x$creation_date),
          Updated_date = ifelse(is.null(x$update_date), NA, x$update_date),
          Compound.name = ifelse(is.null(x$name), NA, x$name),
          Description = ifelse(is.null(x$description), NA, x$description),
          Formula = ifelse(is.null(x$chemical_formula), NA, x$chemical_formula),
          Synonyms = Synonyms,
          Average.mass =  ifelse(
            is.null(x$average_molecular_weight),
            NA,
            x$average_molecular_weight
          ),
          mz = ifelse(
            is.null(x$monisotopic_moleculate_weight),
            NA,
            x$monisotopic_moleculate_weight
          ),
          IUPAC_name = ifelse(is.null(x$iupac_name), NA, x$iupac_name),
          Traditional_IUPAC_name = ifelse(is.null(x$traditional_iupac), NA, x$traditional_iupac),
          CAS.ID = ifelse(
            is.null(x$cas_registry_number),
            NA,
            x$cas_registry_number
          ),
          SMILES.ID = ifelse(is.null(x$smiles), NA, x$smiles),
          INCHI.ID = ifelse(is.null(x$inchi), NA, x$inchi),
          INCHIKEY.ID = ifelse(is.null(x$inchikey), NA, x$inchikey),
          Kingdom = Kingdom,
          Super_class = Super_class,
          Class = Class,
          Sub_class = Sub_class,
          State = State,
          FOODB.ID = ifelse(is.null(x$accession), NA, x$accession),
          HMDB.ID = HMDB.ID,
          PUBCHEM.ID = PUBCHEM.ID,
          CHEMSPIDER.ID = CHEMSPIDER.ID,
          KEGG.ID = KEGG.ID,
          CHEBI.ID = CHEBI.ID,
          BIOCYC.ID = BIOCYC.ID,
          HET.ID = HET.ID,
          WIKIPEDIA.ID = WIKIPEDIA.ID,
          VMH.ID = VMH.ID,
          Food_name,
          Food_type,
          Food_category,
          Food_scientific_name
        )
      }) %>%
      dplyr::bind_rows() %>%
      as.data.frame()

    foodb_result$mz <-
      as.numeric(foodb_result$mz)

    foodb_result$RT <- NA
    foodb_result$From_food <- TRUE
    foodb_result$mz.pos = NA
    foodb_result$mz.neg = NA
    foodb_result$Submitter = "FOODB"

    foodb_result <-
      foodb_result %>%
      dplyr::filter(!is.na(mz) & !is.na(Formula))

    return(foodb_result)
  }



#' @title Convert FOODB compound data (list,
#' from download_foodb_compound function)
#' to metID format database
#' @description Convert FOODB compound data (list,
#' from download_foodb_compound function)
#' to metID format database
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param data data.frame, from read_foodb_compound function.
#' @param path Default is .
#' @param threads threads
#' @return metid database class
#' @importFrom magrittr %>%
#' @importFrom plyr . dlply
#' @importFrom metid construct_database
#' @export

convert_foodb2metid <-
  function(data,
           path = ".",
           threads = 5) {
    dir.create(path, showWarnings = FALSE, recursive = TRUE)

    temp_file <- tempfile()
    dir.create(temp_file, showWarnings = FALSE)
    readr::write_csv(x = data,
                     file = file.path(temp_file, "data.csv"))

    foodb_ms1 <-
      metid::construct_database(
        path = temp_file,
        version = as.character(Sys.Date()),
        metabolite.info.name = "data.csv",
        source = "FOODB",
        link = "https://foodb.ca/",
        creater = "Xiaotao Shen",
        email = "shenxt@stanford.edu",
        rt = FALSE,
        threads = threads
      )

    unlink(file.path(temp_file, "data.csv"))
    unlink(temp_file)

    save(foodb_ms1, file = file.path(path, "foodb_ms1"))
    invisible(foodb_ms1)
  }