massdatabase: Play with databases about mass spectrometry

Documented in convert_lipidbank2metid download_lipidbank_lipid_class request_lipidbank_lipid_class

#' @title Download LipidBank database
#' @description Download the Excel (.xlsx) export of one LipidBank lipid
#' class and save it in \code{path} as "<lipid_class>.xlsx".
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param url Base URL of the download directory (with trailing slash).
#' Default is "https://lipidbank.jp/download/".
#' @param lipid_class Lipid class to download; one of the listed choices
#' (the first one, "All data", is used when nothing is supplied).
#' See here: https://lipidbank.jp/index.html.
#' @param path Directory the file is written to. Default is "."; it is
#' created when it does not exist yet.
#' @return Called for its side effect (the downloaded file); returns
#' \code{NULL} invisibly.
#' @importFrom magrittr %>%
#' @export
download_lipidbank_lipid_class <-
  function(url = "https://lipidbank.jp/download/",
           lipid_class = c(
             "All data",
             "Acylglycerol",
             "Bile Acid",
             "Fatty acid",
             "Long chain alcohol",
             "Long chain aldehyde",
             "Long chain base and Ceramide",
             "Eicosanoid",
             "Ether type lipid",
             "Carotenoid",
             "Coenzyme Q",
             "Vitamin A",
             "Vitamin D",
             "Vitamin E",
             "Vitamin F",
             "Vitamin K",
             "Glycosphingolipid",
             "Glycoglycerolipid and others",
             "Isoprenoid",
             "Lipid peroxide",
             "Lipoamino acid",
             "Lipopolysaccharide",
             "Lipoprotein",
             "Mycolic acid",
             "Glycerophospholipid",
             "Sphingophospholipid",
             "Steroid",
             "Wax"
           ),
           path = ".") {
    lipid_class <- match.arg(lipid_class)

    # lipid_class_table is not defined in this file (presumably a dataset
    # shipped with the package); it maps the class label to the file stem
    # used on the server.
    bre <-
      lipid_class_table$url[match(lipid_class, lipid_class_table$lipid_class)]

    # Fail early instead of requesting ".../NA.xlsx".
    if (length(bre) != 1L || is.na(bre)) {
      stop("No download entry found for lipid class: ",
           lipid_class,
           call. = FALSE)
    }

    # Honour the `url` argument; it used to be overwritten by a hard-coded
    # copy of its default value.
    file_url <- paste0(url, bre, ".xlsx")

    if (!dir.exists(path)) {
      dir.create(path, recursive = TRUE)
    }
    destfile <- file.path(path, paste0(lipid_class, ".xlsx"))

    message("Downloading...\n")
    # .xlsx is a binary (zip) container: without mode = "wb" the file is
    # transferred in text mode on Windows and ends up corrupted.
    status <- download.file(url = file_url,
                            destfile = destfile,
                            mode = "wb")

    if (identical(as.integer(status), 0L)) {
      message("Done.\n")
    } else {
      warning("Download of ", file_url, " returned status ", status, ".",
              call. = FALSE)
    }

    invisible(NULL)
  }


#' @title Request Lipidbank database
#' @description Read the online LipidBank table of one lipid class and
#' return it as a data frame.
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param url Base URL of LipidBank (with trailing slash).
#' Default is "https://lipidbank.jp/".
#' @param lipid_class Lipid class to request; one of the listed choices
#' (the first one, "All data", is used when nothing is supplied).
#' @return A data frame whose column names are taken from the first row of
#' the scraped table, or \code{NULL} (with a message) when the page cannot
#' be read or does not contain the expected table.
#' @importFrom XML xmlTreeParse xmlToList
#' @importFrom magrittr %>%
#' @export
#' @examples
#' x <- request_lipidbank_lipid_class(lipid_class = "Wax")
#' head(x)
request_lipidbank_lipid_class <-
  function(url = "https://lipidbank.jp/",
           lipid_class = c(
             "All data",
             "Acylglycerol",
             "Bile Acid",
             "Fatty acid",
             "Long chain alcohol",
             "Long chain aldehyde",
             "Long chain base and Ceramide",
             "Eicosanoid",
             "Ether type lipid",
             "Carotenoid",
             "Coenzyme Q",
             "Vitamin A",
             "Vitamin D",
             "Vitamin E",
             "Vitamin F",
             "Vitamin K",
             "Glycosphingolipid",
             "Glycoglycerolipid and others",
             "Isoprenoid",
             "Lipid peroxide",
             "Lipoamino acid",
             "Lipopolysaccharide",
             "Lipoprotein",
             "Mycolic acid",
             "Glycerophospholipid",
             "Sphingophospholipid",
             "Steroid",
             "Wax"
           )) {
    lipid_class <- match.arg(lipid_class)

    # lipid_class_table is not defined in this file (presumably a dataset
    # shipped with the package); it maps the class label to the page stem.
    bre <-
      lipid_class_table$url[match(lipid_class, lipid_class_table$lipid_class)]

    url <- paste0(url, bre, ".html")

    # Any failure while fetching or parsing the page is reported and turned
    # into NULL so the caller gets a soft failure instead of an error.
    result <-
      tryCatch(
        rvest::read_html(url) %>%
          rvest::html_table(),
        error = function(e) {
          message("Request failed: ", conditionMessage(e))
          NULL
        }
      )

    if (is.null(result)) {
      message("Check your internet.")
      return(NULL)
    }

    # The data is taken from the second table on the page; without this
    # guard a page with fewer tables raised "subscript out of bounds".
    if (length(result) < 2) {
      message("No data table found at ", url, ".")
      return(NULL)
    }

    result <- result[[2]] %>%
      as.data.frame()

    # The first row of the scraped table holds the real column headers.
    colnames(result) <-
      result[1, ] %>%
      as.character()

    result <-
      result[-1, ]

    result
  }


#' @title Convert lipid bank data (data.frame) to metID format database
#' @description Convert lipid bank data (data.frame) to metID format database.
#' The conversion drops the spectra/annotation columns, renames the remaining
#' ones, cleans and merges compound names and synonyms, computes \code{mz}
#' from the formula with Rdisop, extracts species terms from the
#' \code{Source} text and appends the columns returned by
#' \code{convert_species2source()}.
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param data data.frame holding the LipidBank table. It must contain the
#' columns that are dropped and renamed below (for example "DATA No", NAME,
#' "COMMON NAME", FORMULA, "MOL.WT(average)" and SOURCE).
#' @return The converted data.frame, returned invisibly.
#' @importFrom magrittr %>%
#' @export

convert_lipidbank2metid <-
  function(data) {
    # Drop the columns that are not carried over.
    data <-
      data %>%
      dplyr::select(
        -c(
          Image,
          "MASS SPECTRA",
          "OTHER SPECTRA",
          "NMR SPECTRA",
          "IR SPECTRA",
          "UV SPECTRA",
          "Download cdx file / Mol format file",
          "BOILING POINT",
          "DENSITY",
          "REFRACTIVE INDEX",
          "CHEMICAL SYNTHESIS",
          "OPTICAL ROTATION",
          "SOLUBILITY",
          "CHROMATOGRAM DATA",
          "METABOLISM",
          "GENETIC INFORMATION",
          "NOTE",
          "REFERENCES",
          "Id",
          "MELTING POINT"
        )
      )

    # Rename to the target column names, add the placeholder ID/mz/RT
    # columns and move the leading columns to the front.
    # ("BIOOGICAL ACTIVITY" is the spelling of the incoming column.)
    data <-
      data %>%
      dplyr::rename(
        Lab.ID = `DATA No`,
        Compound.name = NAME,
        Informant = INFORMANT,
        Synonyms = `COMMON NAME`,
        Symbol = SYMBOL,
        Main_class_lipidbank = `Lipid class`,
        Average.mass = `MOL.WT(average)`,
        Formula = FORMULA,
        Biological_activity = `BIOOGICAL ACTIVITY`,
        Source = SOURCE
      ) %>%
      dplyr::mutate(
        LIPIDBANK.ID = Lab.ID,
        CAS.ID = NA,
        HMDB.ID = NA,
        KEGG.ID = NA,
        RT = NA,
        mz = NA,
        mz.pos = NA,
        mz.neg = NA,
        Submitter = "LIPIDBANK"
      ) %>%
      dplyr::select(
        Lab.ID,
        Compound.name,
        mz,
        RT,
        CAS.ID,
        HMDB.ID,
        KEGG.ID,
        Formula,
        mz.pos,
        mz.neg,
        Submitter,
        dplyr::everything()
      )

    data <-
      data %>%
      dplyr::distinct(Lab.ID, .keep_all = TRUE) %>%
      dplyr::filter(!is.na(Compound.name) & !is.na(Formula))

    # Both name columns go through the same clean-up; "/" separated
    # alternatives become "{}" separated ones.
    data$Compound.name <- .clean_lipidbank_name(data$Compound.name)
    data$Synonyms <- .clean_lipidbank_name(data$Synonyms)

    # Cells that are empty strings are treated as missing.
    data[which(data == "", arr.ind = TRUE)] <- NA

    data <-
      data %>%
      dplyr::filter(!is.na(Compound.name) | !is.na(Synonyms))

    # Merge name and synonyms row by row. seq_len() (instead of 1:nrow())
    # keeps this step correct when no row is left, and the typed vapply()
    # always yields a character vector of the right length.
    merged_names <-
      lapply(seq_len(nrow(data)), function(i) {
        .merge_lipidbank_names(compound_name = data$Compound.name[i],
                               synonyms = data$Synonyms[i])
      })

    data$Compound.name <-
      vapply(merged_names, function(z) {
        z[["Compound.name"]]
      }, character(1))
    data$Synonyms <-
      vapply(merged_names, function(z) {
        z[["Synonyms"]]
      }, character(1))

    data <-
      data %>%
      dplyr::distinct(Compound.name, Synonyms, .keep_all = TRUE)

    data <-
      data %>%
      dplyr::filter(!is.na(Formula)) %>%
      dplyr::filter(nchar(Formula) > 1)

    # mz is the mass Rdisop reports for the formula; formulas Rdisop cannot
    # handle give NA and the row is removed right after.
    data$mz <-
      data$Formula %>%
      purrr::map(function(x) {
        tryCatch(
          Rdisop::getMass(Rdisop::getMolecule(x)),
          error = function(e) {
            NA
          }
        )
      }) %>%
      unlist() %>%
      as.numeric()

    data <-
      data %>%
      dplyr::filter(!is.na(mz))
    data$Average.mass <- as.numeric(data$Average.mass)

    ###species
    # One "{}" separated string of species terms per row (NA when Source is
    # missing). vapply() guarantees a character vector, so the Species
    # column is created even for zero rows or when every Source is NA.
    data$Species <-
      vapply(data$Source,
             .parse_lipidbank_species,
             character(1),
             USE.NAMES = FALSE)

    # The commented code below is kept as a record of how a match table was
    # once derived; the table actually used is loaded with data() further
    # down.
    #
    #   all_word <-
    #   Species[!is.na(Species)] %>%
    #     stringr::str_split("\\{\\}") %>%
    #     unlist() %>%
    #     unique()
    #
    #   ##get match table
    #
    #   match_table <-
    #     data.frame(species = stringr::str_to_lower(all_word)) %>%
    #     dplyr::mutate(source = dplyr::case_when(
    #       stringr::str_detect(species, "bacteria|sulfolobus|methanosphaera|pyrococcus") ~ "Bacteria",
    #       stringr::str_detect(species, "mucor|sponge|bacterium|brevundimonas|proteus") ~ "Bacteria",
    #       stringr::str_detect(species, " sp") ~ "Bacteria",
    #       stringr::str_detect(species, "pseudomonas|rhizobium|rhodovulum|rhodospirillum|rhodomicrobium") ~ "Bacteria",
    #       stringr::str_detect(species, "rhodocyclus|salmonella|schizophylum|acinetobacter|actinobacillus") ~ "Bacteria",
    #       stringr::str_detect(species, "aeromonas|aeropyrum|bordetella|campylobacter|thermococcus") ~ "Bacteria",
    #       stringr::str_detect(species, "yersinia|vibrio|vibrio|xanthomonas|ustilago|uredovora|achlya|acetobacter") ~ "Bacteria",
    #       stringr::str_detect(species, "aspergillus|providencia|porphyromonas|streptococcus|strain|bacilli") ~ "Bacteria",
    #       stringr::str_detect(species, "tubercle|anacystis|bacteroides|azospirillum|rhodococcus|rhodobacter|pyramimonas") ~ "Bacteria",
    #       stringr::str_detect(species, "penicillium|pectinatus|pasteurianus|nocardia|neurospora|neisseria|coli") ~ "Bacteria",
    #       stringr::str_detect(species, "fungal|fungus|fungu|schizosaccharomyces|yeast") ~ "Fungi",
    #       stringr::str_detect(species, "human|infant|patient|neonatal|newborn|serum|urine") ~ "Human",
    #       stringr::str_detect(species, "feces|barin|lung|spleen|retina|cortex|female|pregnant|plasma") ~ "Human",
    #       stringr::str_detect(species, "blood|brain|feces|liver|tissue|urinary|adrenal|gestation|testis|skin") ~ "Human",
    #       stringr::str_detect(species, "dog|rat|vertebrates|rabbit|snake|chiken|cattle|chicken") ~ "Animalia",
    #       stringr::str_detect(species, "kangaroos|opossum|pig|koala|mammalian|python") ~ "Animalia",
    #       stringr::str_detect(species, "pelicans|owls|baboon|kite|rana|bullfrog") ~ "Animalia",
    #       stringr::str_detect(species, "bufo|varanus|amyda|fish|moth|drasche|bollworm") ~ "Animalia",
    #       stringr::str_detect(species, "sheep|shark|mouse|animal|paca|moschatus|monkey") ~ "Animalia",
    #       stringr::str_detect(species, "menhaden|boar|bovine|porcine|rodent|rice|mytilus") ~ "Animalia",
    #       stringr::str_detect(species, "prasinophyceae|scallop|quinqueradiata|shigella|worm") ~ "Animalia",
    #       stringr::str_detect(species, "primates|alligator|amphiuma|animal|bird|caiman|whale") ~ "Animalia",
    #       stringr::str_detect(species, "xenopus|bee|grease|wool|latirostris|visceral|viscera|ant") ~ "Animalia",
    #       stringr::str_detect(species, "ascidiacea|marine|asteroidea|clam|sea|porifera|toad|tilapia|thunnus") ~ "Animalia",
    #       stringr::str_detect(species, "atelesto|sturgeon|coral|pernyi|alestes|amoebae|arapaima|potamon|crab|oyster") ~ "Animalia",
    #       stringr::str_detect(species, "ox ") ~ "Animalia",
    #       stringr::str_detect(species, "oncoryhnchus") ~ "Animalia",
    #       stringr::str_detect(species, "sertifer|cyperus|asteraceae|alga|caldariella|plant|codium") ~ "Plantae",
    #       stringr::str_detect(species, "rhamnus|cinchona|cotton|trillium|dioscorea|seed|root|sarsaparilla") ~ "Plantae",
    #       stringr::str_detect(species, "scilla|corn|ruscus|petals|ricinus|sansevieria") ~ "Plantae",
    #       stringr::str_detect(species, "hydnocarpus|palm|acnistus|amaroucium|tunicate|capsicum") ~ "Plantae",
    #       stringr::str_detect(species, "syringae|creeper|pterosperma|gonyaulax|thevetia") ~ "Plantae",
    #       stringr::str_detect(species, "solanaceae|liliaceae|trichosanthes|tragopogon|asclepias|umbellatum") ~ "Plantae",
    #       stringr::str_detect(species, "solanaceae|liliaceae|trichosanthes|tragopogon|asclepias|umbellatum") ~ "Plantae",
    #       stringr::str_detect(species, "thalictrum|leave|tree|leaves|flower|rhizomes|rambutan|racemosa|peucedanum") ~ "Plantae",
    #       stringr::str_detect(species, "neriifolia") ~ "Plantae",
    #       stringr::str_detect(species, "soil|digitonin|diginatin|water|tabacco") ~ "Environment",
    #       stringr::str_detect(species, "pine|ginseng|bean|peach|avocado|carrot|wheat|sultana") ~ "Food_plant",
    #       stringr::str_detect(species, "lemon|cabbage|cocoa|fruit|violet|grape|apple|berries|tanghinin|sunflower") ~ "Food_plant",
    #       stringr::str_detect(species, "apium|banana|peanut|potato|olive") ~ "Food_plant",
    #       stringr::str_detect(species, "milk|lobster|menhaden|marronnier|vinegar|tanghinin|sardine|butter") ~ "Food"
    #     ))

    # Load the match_table dataset into this function's environment.
    data("match_table", envir = environment())

    # convert_species2source() is defined outside this file; its per-row
    # results are row-bound and appended as extra columns. The local name
    # avoids shadowing base::source().
    source_info <-
      data$Species %>%
      stringr::str_to_lower() %>%
      purrr::map(function(x) {
        convert_species2source(x = x,
                               match_table = match_table)
      }) %>%
      dplyr::bind_rows() %>%
      as.data.frame()

    data <-
      cbind(data, source_info) %>%
      as.data.frame()

    invisible(data)

  }

#' Clean a LipidBank name column
#'
#' Removes "<<...>>" tags, double quotes and a trailing dot, turns every "/"
#' (with or without surrounding blanks) into the "{}" separator, strips one
#' trailing and one leading "{}" and trims white space.
#'
#' @param x Character vector of raw names.
#' @return Character vector of the same length as \code{x}.
#' @noRd
.clean_lipidbank_name <- function(x) {
  # The order matters: the blank-padded variants are replaced before the
  # bare "/" so that the blanks are consumed together with the slash.
  x %>%
    stringr::str_replace_all(' \\<\\<[ a-zA-Z\\.\\/0-9]{1,40}\\>\\> ', "") %>%
    stringr::str_replace_all('\\<\\<[ a-zA-Z\\.\\/0-9]{1,40}\\>\\>', "") %>%
    stringr::str_replace_all('\\\"', "") %>%
    stringr::str_replace_all('\\.$', "") %>%
    stringr::str_replace_all(' \\/ ', "{}") %>%
    stringr::str_replace_all(' \\/', "{}") %>%
    stringr::str_replace_all('\\/ ', "{}") %>%
    stringr::str_replace_all('\\/', "{}") %>%
    stringr::str_replace_all('\\{\\}$', "") %>%
    stringr::str_replace_all('^\\{\\}', "") %>%
    stringr::str_trim()
}

#' Merge the compound name and synonyms of one row
#'
#' Splits both values on "{}", pools the unique non-empty pieces and orders
#' them by length, then alphabetically. The compound name is the first piece
#' that does not contain "from" or "Lipid" (the first piece overall when all
#' of them do); the synonyms are all pieces joined with "{}".
#'
#' @param compound_name Single cleaned compound name (may be NA).
#' @param synonyms Single cleaned synonym string (may be NA).
#' @return List with the character scalars \code{Compound.name} and
#' \code{Synonyms}.
#' @noRd
.merge_lipidbank_names <- function(compound_name, synonyms) {
  pieces <- c(compound_name, synonyms)
  pieces <- pieces[!is.na(pieces)]

  pieces <-
    pieces %>%
    stringr::str_split(pattern = "\\{\\}") %>%
    unlist() %>%
    as.character() %>%
    unique()

  # Adjacent separators ("{}{}") leave empty pieces; being the shortest
  # they would sort first and end up as the compound name.
  pieces <- pieces[pieces != ""]

  pieces <- pieces[order(nchar(pieces), pieces)]

  preferred <- pieces[!stringr::str_detect(pieces, "from|Lipid")]

  list(
    Compound.name = if (length(preferred) == 0) pieces[1] else preferred[1],
    Synonyms = paste(pieces, collapse = "{}")
  )
}

#' Strip one leading and one trailing dot and trim white space
#'
#' @param x Character vector.
#' @return Character vector of the same length as \code{x}.
#' @noRd
.strip_edge_dots <- function(x) {
  x %>%
    stringr::str_replace_all("^\\.", "") %>%
    stringr::str_replace_all("\\.$", "") %>%
    stringr::str_trim()
}

#' Extract species terms from one LipidBank "Source" entry
#'
#' @param x Single source description (may be NA).
#' @return Character scalar with the terms joined by "{}", or
#' \code{NA_character_} when \code{x} is NA or nothing is left.
#' @noRd
.parse_lipidbank_species <- function(x) {
  if (is.na(x)) {
    return(NA_character_)
  }

  # Remove boilerplate fragments before any splitting.
  x <-
    x %>%
    stringr::str_replace_all("from African", "") %>%
    stringr::str_replace_all("from Berber", "") %>%
    stringr::str_replace_all("from Chinese", "") %>%
    stringr::str_replace_all("from India", "") %>%
    stringr::str_replace_all("from Korean", "") %>%
    stringr::str_replace_all("\\[Table [0-9]{1,5}\\]", "") %>%
    stringr::str_replace_all("other references\\:", "") %>%
    stringr::str_replace_all("\\(1\\% of total\\)", "")

  ######if it is separated by ;
  # Accept the ";" separated pieces directly when each of them is short
  # (at most five blank separated words); otherwise fall through.
  if (stringr::str_detect(x, ";")) {
    pieces <-
      x %>%
      stringr::str_replace_all(" and ", " ") %>%
      stringr::str_replace_all("  ", " ") %>%
      stringr::str_replace("\\.$", "") %>%
      stringr::str_replace("\\;$", "") %>%
      stringr::str_replace("^\\;", "") %>%
      stringr::str_replace_all("\\/", "") %>%
      stringr::str_replace_all(",", " ") %>%
      stringr::str_replace_all("  ", " ") %>%
      stringr::str_replace_all(pattern = "\\<\\<[ a-zA-Z0-9\\.\\/]{5,50}\\>\\>", "") %>%
      stringr::str_trim()

    pieces <-
      pieces %>%
      stringr::str_split(";") %>%
      `[[`(1) %>%
      stringr::str_trim()

    pieces <- pieces[pieces != ""]
    pieces <- .strip_edge_dots(pieces)

    if (all(lengths(stringr::str_split(pieces, " ")) <= 5)) {
      return(paste(pieces, collapse = "{}"))
    }
  }

  # General path: normalise separators, then split on the "<<...>>" tags.
  x <-
    x %>%
    stringr::str_replace_all(" and ", " ") %>%
    stringr::str_replace_all("  ", " ") %>%
    stringr::str_replace("\\.$", "") %>%
    stringr::str_replace_all("\\/", "") %>%
    stringr::str_replace_all(",", " ") %>%
    stringr::str_replace_all("  ", " ") %>%
    stringr::str_replace_all(";", " ") %>%
    stringr::str_replace_all("  ", " ") %>%
    stringr::str_trim()

  pieces <-
    stringr::str_split(x, pattern = "\\<\\<[ a-zA-Z0-9\\.\\/]{5,50}\\>\\>")[[1]] %>%
    stringr::str_replace("\\.$", "") %>%
    stringr::str_trim()

  pieces <- pieces[pieces != ""]

  pieces <-
    pieces %>%
    lapply(function(y) {
      if (stringr::str_detect(y, "( from ){1}|( of ){1}|( in ){1}")) {
        # Keep only what follows the first " from " / " of " / " in " and
        # drop the filler words from it.
        y <-
          y %>%
          stringr::str_split(pattern = "( from )|( of )|( in )") %>%
          `[[`(1)

        y <- y[-1] %>%
          paste(collapse = " ")

        y <-
          y %>%
          stringr::str_split(" ") %>%
          `[[`(1) %>%
          stringr::str_trim()
        y <-
          y[!y %in% c("in",
                      "of",
                      "the",
                      "many",
                      "from",
                      "and",
                      "during",
                      "with",
                      "a",
                      "high",
                      "or")]
        y <- .strip_edge_dots(y)
        return(paste(y, collapse = " "))

      } else{
        return(.strip_edge_dots(y))
      }
    }) %>%
    unlist()

  pieces <- .strip_edge_dots(pieces)

  if (all(lengths(stringr::str_split(pieces, " ")) <= 5)) {
    return(paste(pieces, collapse = "{}"))
  }

  # At least one piece is long: additionally pull out abbreviated names
  # of the form "E. coli" and of the form "Bacillus sp.".
  abbreviated_genus <-
    stringr::str_extract_all(pieces, "[A-Z]{1}\\. [a-z]{3,20}") %>%
    unlist() %>%
    unique()

  abbreviated_epithet <-
    stringr::str_extract_all(pieces, "[A-Za-z]{3,20} [a-zA-Z]{1,5}\\.") %>%
    unlist() %>%
    unique()

  species <-
    c(abbreviated_genus,
      abbreviated_epithet,
      pieces) %>%
    unique()

  species <- .strip_edge_dots(species)

  species <- species[species != ""]

  if (length(species) == 0) {
    return(NA_character_)
  }

  paste(species, collapse = "{}")
}
tidymass/massdatabase documentation built on Oct. 18, 2024, 3:56 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
tidymass/massdatabase
Play with databases about mass spectrometry

R/9_LIPIDBANK.R
In tidymass/massdatabase: Play with databases about mass spectrometry

Defines functions convert_lipidbank2metid request_lipidbank_lipid_class download_lipidbank_lipid_class

Documented in convert_lipidbank2metid download_lipidbank_lipid_class request_lipidbank_lipid_class

R Package Documentation

Browse R Packages

We want your feedback!

tidymass/massdatabase Play with databases about mass spectrometry

R/9_LIPIDBANK.R In tidymass/massdatabase: Play with databases about mass spectrometry

Defines functions convert_lipidbank2metid request_lipidbank_lipid_class download_lipidbank_lipid_class

Documented in convert_lipidbank2metid download_lipidbank_lipid_class request_lipidbank_lipid_class

R Package Documentation

Browse R Packages

We want your feedback!

tidymass/massdatabase
Play with databases about mass spectrometry

R/9_LIPIDBANK.R
In tidymass/massdatabase: Play with databases about mass spectrometry