R/leCsvAccess.R

Defines functions findMass findLevel findName findCAS findFormula findSmiles findRt findMz findMz.formula getAdductProperties getAdductInformation getMonoisotopicMass knownAdducts getMolecule .translateLevel resetList loadList

Documented in findCAS findFormula findLevel findMass findMz findMz.formula findName findRt findSmiles getMolecule loadList resetList

.listEnvEnv <- new.env()
assign("listEnv", NULL, envir=.listEnvEnv)

#' Load compound list for RMassBank
#' 
#' Loads a CSV compound list with compound IDs
#' 
#' The list is loaded into the variable \code{\var{compoundList}} in the environment
#' \code{listEnv} (which defaults to the global environment) and used by
#' the \code{findMz}, \code{findCAS}, ... functions. The CSV file is required to have at least the following columns, which are used for 
#' further processing and must be named correctly (but present in any order): \var{ID, Name, SMILES, RT,
#' CAS}
#' 
#' resetList() clears a currently loaded list.
#' 
#' @aliases loadList resetList
#' @usage loadList(path, listEnv=NULL, check=TRUE)
#' 
#' resetList()
#' @param path Path to the CSV list.
#' @param listEnv The environment to load the list into. By default, the namelist is loaded
#' 		into an environment internally in RMassBank. 
#' @param check A parameter that specifies whether the SMILES-Codes in the list should be checked for readability by rcdk.
#' @return No return value.
#' @author Michael Stravs
#' @seealso \code{\link{findMz}}
#' @examples
#' 
#' ##
#' \dontrun{loadList("mylist.csv")}
#' 
#' @export
loadList <- function(path, listEnv = NULL, check = TRUE)
{
  	if(is.null(listEnv))
  		listEnv <- .listEnvEnv
  	if(!file.exists(path))
  		stop("The supplied file does not exist, please supply a correct path")
          
    # Try out if the file is comma- or semicolon-separated
    compoundList <- read.csv(path, stringsAsFactors=FALSE, check.names=FALSE)
  	n <- colnames(compoundList)
    if(!("ID" %in% n)){ # If no ID column, it must be semicolon separated
        compoundList <- read.csv2(path, stringsAsFactors=FALSE, check.names=FALSE)
        n <- colnames(compoundList)
        if(!("ID" %in% n)){ # ...or there must be something wrong with the column names
             stop("There is no 'ID' column in the compound list")
        }
    }
    
    # Now everything should be fine, at least regarding csv and ssv
    
    # Are there duplicate compound IDs? 
    if(any(duplicated(compoundList$ID))){
        stop("Duplicate compound IDs are present. Please check the compound list.")
    }
    
    # Do all Compoundlist IDs have 4 characters or less?
    if(any(nchar(compoundList$ID) > 4)){
        stop("The maximum number of digits for compound IDs is 4")
    }
    
    # Evaluate if all strictly necessary columns are in the list
    cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS')
    d <- setdiff(cols, n)
    if(length(d)>0){
        stop(paste("Some columns are missing in the compound list. It needs at least", paste(cols,collapse=", ")))
    }
    
    # Is there a "Level" column?
    if("Level" %in% n){
        newList <- TRUE
    } else {
        newList <- FALSE
    }
    
    # Assign for now...
    assign("listEnv", listEnv, envir=.listEnvEnv) 
    .listEnvEnv$listEnv$compoundList <- compoundList
    # If "level" is in the compound list we have to check several things:
    
    if(newList){
        # a) Are the levels part of the defined levels?
        # b) Are the values ok for every level? (i.e. all necessary values supplied for each line in the compound list?)
        
        # Check a) (and translate to number levels because handling numbers and text would make the if-statements here even more confusing)
        
        compoundList$Level <- sapply(as.character(compoundList$Level),.translateLevel)
        
        # Need this variable for levels 3b, 3c, 3d, 4 and potentially 3 and 3a
        formulaColPresent <- "Formula" %in% n
        
        if(any(c("3b","3c","3d","4") %in% compoundList$Level)){
    
            if(!(formulaColPresent)){
                .listEnvEnv$listEnv$compoundList <- NULL
                stop('The compound list must contain the column "Formula" if a tentative or formula compound (levels 3b, 3c, 3d, 4) is present')
            }
        }
        # Need this variable for level 5
        if("5" %in% compoundList$Level){
            mzCol <- which(c("mz","m/z","mass","exactMass") %in% n)
            if(length(mzCol) > 1){
                .listEnvEnv$listEnv$compoundList <- NULL
                stop("The compound list can only contain one column with one of the following column names: 'mz','m/z','mass','exactMass' if an unknown compound (level 5) is present")
            }
        
            if(mzCol == 2){
                    n[which(n == "m/z")] <- "mz"
            }
            
            if(mzCol == 4){
                    n[which(n == "exactMass")] <- "mass"
            }
            .listEnvEnv$listEnv$mzCol <- c("mz","m/z","mass","exactMass")[mzCol]
            colnames(compoundList) <- n
        }
        # Check b) 
        for(i in 1:length(compoundList$Level)){
            level <- compoundList$Level[i] 
            # ID must have numbers as values
            if(!is.numeric(compoundList[i,"ID"])){
                .listEnvEnv$listEnv$compoundList <- NULL
                stop(paste("Value for 'ID' in line", i, "of the compound list is not a valid compound ID"))
            }
            
            # If level is "1x" or "2x", the SMILES must be supplied and must be correct
            if(level %in% c("0","1","1a","1b","1c","2","2a","2b")){
                currEnvir <- environment()
                
                tryCatch(
                    findMz(compoundList[i,"ID"]),
                    error = function(e){
                        .listEnvEnv$listEnv$compoundList <- NULL
                        stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES"))
                    }
                )
            }
            
            # If level is "3" or "3a", a valid smiles or formula must be supplied
            if(level %in% c("3","3a")){
  
                if(!is.na(findSmiles(compoundList[i,"ID"]))){
                    tryCatch(
                        findMz(compoundList[i,"ID"]),
                        error = function(e){
                            .listEnvEnv$listEnv$compoundList <- NULL
                            stop(paste("'SMILES' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid SMILES"))
                        }
                    )
                } else{
                    if(!formulaColPresent){
                        .listEnvEnv$listEnv$compoundList <- NULL
                        stop(paste("Compound", compoundList[i,"ID"], "in line", i, "of the compound list is marked as level 3, but there is no valid SMILES
                            value nor a 'Formula' column present"))
                    }
                    
                    tryCatch(
                        rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")),
                        error = function(e){
                            .listEnvEnv$listEnv$compoundList <- NULL
                            stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula"))
                        }
                    )
                }
            }
            
            # If level is "3b","3c","3d" or "4", a valid formula must be supplied
            if(level %in% c("3b","3c","3d","4")){
                
                tryCatch(
                    rcdk::get.formula(findFormula(compoundList[i,"ID"], retrieval="tentative")),
                    error = function(e){
                        .listEnvEnv$listEnv$compoundList <- NULL
                        stop(paste("'Formula' value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid Formula"))
                    }
                )
            }
            # If level is "5", m/z must be supplied
           
            if(level == "5"){
                if(!is.numeric(findMz(compoundList[i,"ID"], retrieval="unknown")$mzCenter)){
                    .listEnvEnv$listEnv$compoundList <- NULL
                    stop(paste(c("mz","m/z","mass","exactMass")[mzCol],"value for compound", compoundList[i,"ID"] ,"in line", i, "of the compound list is not a valid number"))
                }
            }
        }
    }
    
    # If "Level" is not in the compound list it MUST be a standard list, so process just as before:
    if(!newList){
        cols <- c('ID', 'Name', 'SMILES', 'RT', 'CAS')
        d <- setdiff(cols, n)
        if(length(d)>0){
            stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.")
        }
  
        ###
        ###Test if all the IDs work
        ###
        
        if(check){
            wrongID <- vector()
            currEnvir <- environment()
            sapply(compoundList[,"ID"], function(x){
                tryCatch(
                    findMz(x),
                    error = function(e){
                      if(RMassBank.env$verbose.output)
                        cat(paste("### Warning ### Error finding SMILES for ID '", x, "': ", e, sep = ""))
                      
                      currEnvir$wrongID <- c(currEnvir$wrongID, x)
                    }
                )
            })
            if(length(wrongID)){
                .listEnvEnv$listEnv$compoundList <- NULL
                stop(paste("Unable to interpret the SMILES-strings for ID(s)", paste(wrongID, collapse= " "), "\nPlease check these and load the list again."))
            }
        }
        Level <- rep("0",nrow(compoundList))
        .listEnvEnv$listEnv$compoundList <- cbind(compoundList,Level)
    }
    message("Loaded compoundlist successfully")
}

#' @export
resetList <- function()
{
	if(is.null(.listEnvEnv$listEnv))
		return()
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		return()
	rm(.listEnvEnv$listEnv$compoundList)
	assign("listEnv", NULL, envir=.listEnvEnv)
}

# Function that translates one entry of the level column of the compound list into the number system
.translateLevel <- function(level){
    if(!(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a","3b","3c","3d","4","5"))){
        switch(level,
            standard={
                return("1a")
            },
            parent={
                return("1b")
            },
            confirmed={
                return("1c")
            },
            probable={
                return("2")
            },
            probableLibrary={
                return("2a")
            },
            probableDiagnostic={
                return("2b")
            },
            tentative={
                return("3")
            },
            tentativeStructure={
                return("3a")
            },
            tentativeIsomer={
                return("3b")
            },
            tentativeTPClass={
                return("3c")
            },
            tentativeBestMatch={
                return("3d")
            },
            formula={
                return("4")
            },
            unknown={
                return("5")
            },
            exactMass={
                return("5")
            },
            {
                stop(paste(level, "is not a valid level"))
            }
        )
    } else{
        return(level)
    }
}

#' Create Rcdk molecule from SMILES
#' 
#' Generates a Rcdk molecule object from SMILES code, which is fully typed and
#' usable (in contrast to the built-in \code{parse.smiles}).
#' 
#' \bold{NOTE: As of today (2012-03-16), Rcdk discards stereochemistry when
#' loading the SMILES code!} Therefore, do not trust this function blindly,
#' e.g.  don't generate InChI keys from the result. It is, however, useful if
#' you want to compute the mass (or something else) with Rcdk.
#' 
#' @usage getMolecule(smiles)
#' @param smiles The SMILES code of the compound.
#' @return A Rcdk \code{IAtomContainer} reference.
#' @author Michael Stravs
#' @seealso \code{\link{parse.smiles}}
#' @examples
#' 
#' # Lindane:
#' getMolecule("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)Cl")
#' # Benzene:
#' getMolecule("C1=CC=CC=C1")
#' 
#' @export
getMolecule <- function(smiles)
{
  capture.output(mol <- parse.smiles(smiles)[[1]])
  do.aromaticity(mol)
  convert.implicit.to.explicit(mol)
  do.aromaticity(mol)
  do.typing(mol)
  do.isotopes(mol)
 
  return(mol)
}

knownAdducts <- function(){
  return(getAdductInformation("")$mode)
}
getMonoisotopicMass <- function(formula){
  if(!exists("isotopes")) data("isotopes", package = "enviPat")
  
  if(formula == "") return(0)
  
  if(grepl(x = formula, pattern = "-")){
    starts <- gregexpr(text = formula, pattern = "[A-Z]")[[1]]
    subFormulas <- sapply(X = seq_along(starts), FUN = function(startIdx){
      ifelse(
        test = startIdx < length(starts), 
        yes = substr(x = formula, start = starts[[startIdx]], stop = starts[[startIdx + 1]] - 1), 
        no  = substr(x = formula, start = starts[[startIdx]], stop = nchar(formula))
      )
    })
    
    monoisotopicMass <- sum(sapply(X = subFormulas, FUN = function(subFormula){
      ifelse(
        test = grepl(x = subFormula, pattern = "-"), 
        yes = -enviPat::isopattern(isotopes = isotopes, chemforms = gsub(x = subFormula, pattern = "-", replacement = ""), threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]], 
        no  =  enviPat::isopattern(isotopes = isotopes, chemforms = subFormula,                                            threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]]
      )
    }))
  } else {
    monoisotopicMass <- enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]]
  }
  return(monoisotopicMass)
}
getAdductInformation <- function(formula){
  adductDf <- as.data.frame(rbind(
    
    ## positive: M+X
    c(mode = "pH",       addition = "H",         charge = 1, adductString = "[M+H]+"),
    c(mode = "pLi",      addition = "Li",        charge = 1, adductString = "[M+Li]+"),
    c(mode = "pNa",      addition = "Na",        charge = 1, adductString = "[M+Na]+"),
    c(mode = "pNa_mO3S_mH",      addition = "Na1O-3S-1H-1",        charge = 1, adductString = "[M-O3S-H+Na]+"),
    c(mode = "pK",       addition = "K",         charge = 1, adductString = "[M+K]+"),
    c(mode = "pM",       addition = "",          charge = 1, adductString = "[M]+"),
    c(mode = "pM_mC7H11NO9S2",       addition = "C-7H-11NO-9S-2",          charge = 1, adductString = "[M-C7H11NO9S2]+"),
    c(mode = "pM_mC7H12NO9S2",       addition = "C-7H-12NO-9S-2",          charge = 1, adductString = "[M-C7H12NO9S2]+"),
    c(mode = "pNH4",     addition = "NH4",       charge = 1, adductString = "[M+NH4]+"),
    c(mode = "p2Na_mH",  addition = "Na2H-1",    charge = 1, adductString = "[M+2Na-H]+"),
    c(mode = "pACN_pH",  addition = "C2H4N1",    charge = 1, adductString = "[M+ACN+H]+"),
    c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"),
    c(mode = "pH_mC7H6O",     addition = "C-7H-5O-1", charge = 1, adductString = "[M-C7H6O+H]+"),
    c(mode = "pH_mC18H30O14", addition = "C-18H-29O-14", charge = 1, adductString = "[M-C18H30O14+H]+"),
    c(mode = "pH_mC6H10O5",   addition = "C-6H-9O-5", charge = 1, adductString = "[M-C6H10O5+H]+"),
    c(mode = "pH_mC12H20O9",  addition = "C-12H-19O-9", charge = 1, adductString = "[M-C12H20O9+H]+"),
    c(mode = "pH_mC9H8O4_mH2O",  addition = "C-9H-9O-5", charge = 1, adductString = "[M-C9H8O4-H2O+H]+"),
    c(mode = "pH_mC6H10O5_mH2O", addition = "C-6H-11O-6", charge = 1, adductString = "[M-C6H10O5-H2O+H]+"),
    c(mode = "pH_mC5H8NO4",   addition = "C-5H-7N-1O-4", charge = 1, adductString = "[M-C5H8NO4+H]+"),
    c(mode = "pH_mO3S",       addition = "O-3S-1H1", charge = 1, adductString = "[M-O3S+H]+"),
    c(mode = "pH_mC6H10O8S",  addition = "C-6H-9O-8S-1", charge = 1, adductString = "[M-C6H10O8S+H]+"),
    c(mode = "pH_mC5H10N2O",  addition = "C-5H-9N-2O-1", charge = 1, adductString = "[M-C5H10N2O+H]+"),
    c(mode = "pH_mHO3P",      addition = "O-3P-1", charge = 1, adductString = "[M-HO3P+H]+"),
    c(mode = "pH_mC4H7",      addition = "C-4H-6", charge = 1, adductString = "[M-C4H7+H]+"),
    c(mode = "pH_mC6H10O4",   addition = "C-6H-9O-4", charge = 1, adductString = "[M-C6H10O4+H]+"),
    c(mode = "pH_mC5H8O3",    addition = "C-5H-7O-3", charge = 1, adductString = "[M-C5H8O3+H]+"),
    c(mode = "pH_mCO",        addition = "H1C-1O-1", charge = 1, adductString = "[M-CO+H]+"),
    c(mode = "p_mCO",         addition = "C-1O-1", charge = 1, adductString = "[M-CO]+"),
    c(mode = "pH_mO3",        addition = "H-1O-3", charge = 1, adductString = "[M-O3+H]+"),
    c(mode = "pH_mC3H6",      addition = "C-3H-5", charge = 1, adductString = "[M-C3H6+H]+"),
    c(mode = "pH_mC4H3O5",    addition = "C-4H-2O-5", charge = 1, adductString = "[M-C4H3O5+H]+"),
    c(mode = "pH_mC6H11O6",   addition = "C-6H-10O-6", charge = 1, adductString = "[M-C6H11O6+H]+"),
    c(mode = "pH_mCH4S",     addition = "C-1H-3S-1", charge = 1, adductString = "[M-CH4S+H]+"),
    c(mode = "pH_mC7H12O6",  addition = "C-7H-11O-6", charge = 1, adductString = "[M-C7H12O6+H]+"),
    c(mode = "pH_mCH4O",     addition = "C-1H-3O-1", charge = 1, adductString = "[M-CH4O+H]+"),
    c(mode = "pH_mCH2O2",    addition = "C-1H-1O-2", charge = 1, adductString = "[M-CH2O2+H]+"),
    c(mode = "pH_mC4H8",     addition = "C-4H-7", charge = 1, adductString = "[M-C4H8+H]+"),
    c(mode = "pH_mC3H6O",    addition = "C-3H-5O-1", charge = 1, adductString = "[M-C3H6O+H]+"),
    c(mode = "pH_mC8H18O2",  addition = "C-8H-17O-2", charge = 1, adductString = "[M-C8H18O2+H]+"),
    c(mode = "pH_mC6H14O2",  addition = "C-6H-13O-2", charge = 1, adductString = "[M-C6H14O2+H]+"),
    c(mode = "pH_mC4H12O2",  addition = "C-4H-11O-2", charge = 1, adductString = "[M-C4H12O2+H]+"),
    c(mode = "pH_mH2O",  addition = "H-1O-1",    charge = 2, adductString = "[M-H2O+H]+"),
    c(mode = "pNa_mH2O", addition = "H-2O-1Na1", charge = 2, adductString = "[M-H2O+Na]+"),
    c(mode = "pH_mCO2",  addition = "C-1O-2H1", charge = 1, adductString = "[M-CO2+H]+"),
    c(mode = "pH_mO",  addition = "O-1H1", charge = 1, adductString = "[M-O+H]+"),
    c(mode = "p_mO",  addition = "O-1", charge = 1, adductString = "[M-O]+"),
    c(mode = "p2H",      addition = "H2",        charge = 2, adductString = "[M+2H]2+"),
    c(mode = "pACN_p2H", addition = "C2H5N1",    charge = 2, adductString = "[M+ACN+2H]2+"),
    ## positive: 2M+X
    c(mode = "pM_pH",      addition = add.formula(formula, "H1"),     charge = 1, adductString = "[2M+H]+"),
    c(mode = "pM_pK",      addition = add.formula(formula, "K1"),     charge = 1, adductString = "[2M+K]+"),
    c(mode = "pM_pNa",     addition = add.formula(formula, "Na1"),    charge = 1, adductString = "[2M+Na]+"),
    c(mode = "pM_pNH4",    addition = add.formula(formula, "N1H4"),   charge = 1, adductString = "[2M+NH4]+"),
    c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"),
    ## positive: strange positive adducts
    c(mode = "pCOONa",         addition = "C1O2Na1", charge =  1, adductString = "[M+COONa]+"),
    c(mode = "p3H_c1",         addition = "H3",      charge =  1, adductString = "[M+3H]+"),
    c(mode = "pH2O_c1",        addition = "H2O1",    charge =  1, adductString = "[M+H2O]+"),
    c(mode = "pH_m2H2O",       addition = "H-3O-2",  charge =  1, adductString = "[M-2H2O+H]+"),
    c(mode = "pH_pH2O",        addition = "H3O1",    charge =  1, adductString = "[M+H2O+H]+"),
    c(mode = "pH_mNH3",        addition = "N-1H-2",  charge =  1, adductString = "[M-NH3+H]+"),
    c(mode = "p2H_c1",         addition = "H2",      charge =  1, adductString = "[M+2H]+"),
    c(mode = "p_mNH3_c1",      addition = "N-1H-3",  charge =  1, adductString = "[M-NH2-H]+"),
    c(mode = "p_mNH2_pH_c1",   addition = "N-1H-1",  charge =  1, adductString = "[M-NH2+H]+"),
    c(mode = "pM_p2Na_m3H_c1", addition = add.formula(formula, "Na2H-3"), charge =  1, adductString = "[2M+2Na-3H]+"),
    c(mode = "pM_pNa_m2H_c1",  addition = add.formula(formula, "Na1H-2"), charge =  1, adductString = "[2M+Na-2H]+"),
    c(mode = "pM_pNa_mH_c1",   addition = add.formula(formula, "Na1H-1"), charge =  1, adductString = "[2M+Na-H]+"),
    c(mode = "pM_p2Na_m2H_c1", addition = add.formula(formula, "Na2H-2"), charge =  1, adductString = "[2M+2Na-2H]+"),
    c(mode = "pM_pH_m2H2O_c1", addition = add.formula(formula, "H-3O-2"), charge =  1, adductString = "[2M-2H2O+H]+"),
    c(mode = "pM_pH_mH2O",     addition = add.formula(formula, "H-1O-1"), charge =  1, adductString = "[2M-H2O+H]+"),
    c(mode = "pM_pNa_mH2O",    addition = add.formula(formula, "H-2O-1Na1"), charge =  1, adductString = "[2M-H2O+Na]+"),
    c(mode = "pM_m2H_c1",      addition = add.formula(formula, "H-2"),    charge =  1, adductString = "[2M-2H]+"),
    c(mode = "pM_mH_c2",       addition = add.formula(formula, "H-1"),    charge =  1, adductString = "[2M-2H+H]+"),
    c(mode = "pM_pLi",         addition = add.formula(formula, "Li1"),    charge =  1, adductString = "[2M+Li]+"),
    c(mode = "pM_pH_m2O",      addition = add.formula(formula, "O-2H1"),  charge =  1, adductString = "[2M-2O+H]+"),
    c(mode = "pM_pNa_m2O",     addition = add.formula(formula, "O-2Na1"), charge =  1, adductString = "[2M-2O+Na]+"),
    c(mode = "pM_pH_m3O",      addition = add.formula(formula, "O-3H1"),  charge =  1, adductString = "[2M-3O+H]+"),
    c(mode = "pM_pNa_m3O",     addition = add.formula(formula, "O-3Na1"), charge =  1, adductString = "[2M-3O+Na]+"),
    c(mode = "pM_mH_c1",       addition = add.formula(formula, "H-1"),    charge =  1, adductString = "[2M-H]+"),
    c(mode = "p_p2M_m3H",  addition = add.formula(formula, add.formula(formula, "H-3")), charge = 1, adductString = "[3M-3H]+"),
    c(mode = "pH_p2M_m2H2O",  addition = add.formula(formula, add.formula(formula, "H-3O-2")), charge = 1, adductString = "[3M-2H2O+H]+"),
    c(mode = "pNa_p2M_m2H2O",  addition = add.formula(formula, add.formula(formula, "H-4O-2Na1")), charge = 1, adductString = "[3M-2H2O+Na]+"),
    c(mode = "p_p2M_m2H2O",  addition = add.formula(formula, add.formula(formula, "H-4O-2")), charge = 1, adductString = "[3M-2H2O]+"),
    c(mode = "pM_mH_pH",       addition = formula,   charge =  1, adductString = "[2M-H+H]+"),
    c(mode = "pH_c2",          addition = "H1",      charge =  2, adductString = "[M+H]2+"),
    
    
    ## negative: M-X
    c(mode = "mH",      addition = "H-1",    charge = -1, adductString = "[M-H]-"),
    c(mode = "mCl",     addition = "Cl1",    charge = -1, adductString = "[M+Cl]-"),
    c(mode = "mFA",     addition = "C1O2H",  charge = -1, adductString = "[M+HCOOH-H]-"),
    c(mode = "mH_pTFA", addition = "C2F3O2", charge = -1, adductString = "[M+CF3CO2H-H]-"),
    
    c(mode = "mH_mC6H10O5", addition = "C-6H-11O-5", charge = -1, adductString = "[M-C6H10O5-H]-"),
    
    c(mode = "mFA_pH",  addition = "C1O2H2", charge = -1, adductString = "[M+HCOOH]-"),
    c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"),
    c(mode = "mCO2",    addition = "C-1O-2", charge = -1, adductString = "[M-CO2]-"),
    c(mode = "mH_mCH3", addition = "C-1H-4", charge = -1, adductString = "[M-CH3-H]-"),
    c(mode = "mH_mCO2", addition = "C-1H-1O-2", charge = -1, adductString = "[M-CO2-H]-"),
    c(mode = "mCH3",    addition = "C-1H-3", charge = -1, adductString = "[M-CH3]-"),
    c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"),
    c(mode = "mM",      addition = "",       charge = -1, adductString = "[M]-"),
    c(mode = "m2H",     addition = "H-2",    charge = -1, adductString = "[M-2H]-"), ## in case of positively charged compounds
    c(mode = "m2H_c2",  addition = "H-2",    charge = -2, adductString = "[M-2H]2-"),
    ## negative: 2M-X
    c(mode = "mH_pM",      addition = add.formula(formula, "H-1"),    charge = -1, adductString = "[2M-H]-"),
    c(mode = "mFA_pM",     addition = add.formula(formula, "C1O2H"),  charge = -1, adductString = "[2M+HCOOH-H]-"),
    c(mode = "mH_pM_mH2O", addition = add.formula(formula, "H-3O-1"), charge = -1, adductString = "[2M-H2O-H]-"),
    c(mode = "m2H_pM_pNa", addition = add.formula(formula, "H-2Na1"), charge = -1, adductString = "[2M+Na-2H]-"),
    ## negative: strange adducts
    c(mode = "mpM",            addition = formula,    charge = -1, adductString = "[2M]-"),
    c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2",  charge = -1, adductString = "[M+HCOOH+Na-2H]-"),
    c(mode = "mH_p2H",         addition = "H2",       charge = -1, adductString = "[M+3H-H]-"),
    c(mode = "mH_pH",          addition = "H1",       charge = -1, adductString = "[M+2H-H]-"),
    c(mode = "mH_pH2O",        addition = "H1O1",     charge = -1, adductString = "[M+H2O-H]-"),
    c(mode = "m4H_pM_p3Na",    addition = add.formula(formula, "Na3H-4"),    charge = -1, adductString = "[2M+3Na-4H]-"),
    c(mode = "m2H_mNH3_pNa",   addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"),
    c(mode = "m3H_pM_p2Na",    addition = add.formula(formula, "Na2H-3"),    charge = -1, adductString = "[2M+2Na-3H]-"),
    c(mode = "m3H_pM",         addition = add.formula(formula, "H-3"),       charge = -1, adductString = "[2M-3H]-"),
    c(mode = "mH_p2M",         addition = add.formula(formula, add.formula(formula, "H-1")), charge = -1, adductString = "[3M-H]-"),
    
    ## ???
    c(mode = "",        addition = "",       charge = 0,  adductString = "[M]")
  ), stringsAsFactors = F)
  adductDf$charge <- as.integer(adductDf$charge)
  
  if(any(any(duplicated(adductDf$mode)), any(duplicated(adductDf$adductString)))) stop("Invalid adduct table")
  
  return(adductDf)
}
getAdductProperties <- function(mode, formula){
  if(grepl(x = mode, pattern = "pM") & is.null(formula))
    stop("Cannot calculate pM adduct without formula")
  else if(is.null(formula)) formula <- ""
  
  adductDf <- getAdductInformation(formula)
  
  if(!(mode %in% adductDf$mode))
    stop("mode = \"", mode, "\" not defined")
  
  mzopt <- as.list(adductDf[adductDf$mode==mode,])
  return(mzopt)
}

#' Find the exact mass +/- a given margin for a given formula or its ions and adducts.
#' 
#' @param formula The molecular formula  in text or list format (see \code{\link{formulastring.to.list}}
#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions 
#' 			([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). "" for the uncharged molecule.
#' @param ppm The ppm margin to add/subtract
#' @param deltaMz The absolute mass to add/subtract. Cumulative with \code{ppm}
#' @return A \code{list(mzMin=, mzCenter=, mzMax=)} with the masses.
#' @author Michael A. Stravs, Eawag <michael.stravs@@eawag.ch>
#' @examples findMz.formula("C6H6")
#' @seealso \code{\link{findMz}}
#' @export
findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) 
{
	if (!any(mode %in% knownAdducts())) 
		stop(paste("The ionization mode", mode, "is unknown."))
  mzopt <- getAdductProperties(mode, formula)
	formula <- add.formula(formula, mzopt$addition)
	# Since in special cases we want to use this with negative and zero number of atoms, we account for this case
	# by splitting up the formula into positive and negative atom counts (this eliminates the zeroes.)
	formula.split <- split.formula.posneg(formula)
	m <- 0
	if(formula.split$pos != "")
	{
		formula.pos <- get.formula(formula.split$pos, charge = mzopt$charge)
		m = m + formula.pos@mass
	}
	if(formula.split$neg != "")
	{
		formula.neg <- get.formula(formula.split$neg, charge = -mzopt$charge)
		m = m - formula.neg@mass
	}
	if((nchar(formula.split$pos)==0) & (nchar(formula.split$neg)==0))
	{
		m <- get.formula("H", charge = mzopt$charge)@mass - get.formula("H", charge = 0)@mass
	}
	delta <- ppm(m, ppm, l = TRUE)
	return(list(mzMin = delta[[2]] - deltaMz, mzMax = delta[[1]] + 
							deltaMz, mzCenter = m))
}

#' Find compound information
#' 
#' Retrieves compound information from the loaded compound list or calculates
#' it from the SMILES code in the list.
#' 
#' @aliases findMz findSmiles findFormula findRt findCAS findName findLevel
#' @usage  findMz(cpdID, mode = "pH", ppm = 10, deltaMz = 0, retrieval="standard")
#' 
#' findRt(cpdID) 
#' 
#' findSmiles(cpdID) 
#' 
#' findFormula(cpdID, retrieval="standard") 
#' 
#' findCAS(cpdID)
#' 
#' findName(cpdID)
#'
#' findLevel(cpdID, compact=FALSE)
#' @param cpdID The compound ID in the compound list.
#' @param mode Specifies the species of the molecule: An empty string specifies
#' uncharged monoisotopic mass, \code{\var{pH}} (positive H) specifies [M+H]+,
#' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+,
#' \code{\var{mH}} and \code{\var{mFA}} specify [M-H]- and [M+FA]-,
#' respectively. (I apologize for the naming of \code{\var{pH}} which has
#' absolutely nothing to do with chemical \emph{pH} values.)
#' @param ppm Specifies ppm window (10 ppm will return the range of the
#' molecular mass + and - 10 ppm).
#' @param deltaMz Specifies additional m/z window to add to the range (deltaMz
#' = 0.02 will return the range of the molecular mass +- 0.02 (and additionally
#' +- the set ppm value).
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only know thing is the m/z
#' @param compact Only for \code{findLevel}, returns the "retrieval" parameter used for many functions 
#' within RMassBank if TRUE
#' @return \code{findMz} will return a \code{list(mzCenter=, mzMin=, mzMax=)}
#' with the molecular weight of the given ion, as calculated from the SMILES
#' code and Rcdk.
#' 
#' \code{findRt}, \code{findSmiles},\code{findCAS},\code{findName} will return
#' the corresponding entry from the compound list. \code{findFormula} returns
#' the molecular formula as determined from the SMILES code.
#' 
#' @author Michael Stravs
#' @seealso \code{\link{findMass}}, \code{\link{loadList}}, \code{\link{findMz.formula}}
#' @examples
#' 
#' \dontrun{%
#' 	findMz(123, "pH", 5)
#' 	findFormula(123)
#' }
#' 
#' @export
findMz <- function(cpdID, mode="pH", ppm=10, deltaMz=0, retrieval="standard",
                   unknownMass = getOption("RMassBank")$unknownMass)
{
  if(is.null(unknownMass))
    unknownMass = "charged"
  if(!(unknownMass %in% c("charged", "neutral")))
     stop("unknownMass definition must be either 'charged' or 'neutral', default is 'charged'")
     
    # In case of unknown: m/z is in table
    if(retrieval == "unknown"){
        mz <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),.listEnvEnv$listEnv$mzCol]
        delta <- ppm(mz, ppm, l=TRUE)
        if(unknownMass == "neutral")
          dmass <- findMz.formula("", mode=mode)$mzCenter
        else
          dmass <- 0
        return(list(mzMin=delta[2] - deltaMz + dmass, mzMax=delta[1] + deltaMz + dmass, mzCenter=mz + dmass))
    } 
    
    # In case of tentative: calculate mass from formula
    if(retrieval == "tentative"){
        return(findMz.formula(findFormula(cpdID, "tentative"),mode=mode))
    }
    
    # All other cases: Use smiles to calculate mass
    if(retrieval == "standard"){
        s <- findSmiles(cpdID)
        #if(s=="-") s <- NA
        if(is.na(s)){
            stop("There was no matching SMILES-entry to the supplied cpdID(s) \n  Please check the cpdIDs and the compoundlist.")
            return(list(mzMin=NA,mzMax=NA,mzCenter=NA))
        }
        formula <- .get.mol2formula(getMolecule(s))
        return(findMz.formula(formula@string, mode, ppm, deltaMz))
    }
}

#findMz <- function(...)	findMz.rcdk(...)
#' @export
findRt <- function(cpdID) {
	if(is.null(.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(is.character(cpdID))
		cpdID <- as.numeric(cpdID)
	rt <- as.numeric(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"RT"])
	if(!is.null(getOption("RMassBank")$rtShift))
		rt <- rt + getOption("RMassBank")$rtShift
	if(is.na(rt)) return(list(RT=NA))
	else return(list(RT=rt))
}

#' @export
findSmiles <- function(cpdID) {
	if(is.character(cpdID))
		cpdID <- as.numeric(cpdID)
	if(is.null(.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
  if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "")
    return(NA)
	return(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"])
}

#' @export
findFormula <- function(cpdID, retrieval = "standard") {
    
    # In case of tentative: read formula from table
    if(retrieval=="tentative"){
        return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Formula"])
    }
    
    # Otherwise: Convert smiles to formula
    if(retrieval=="standard"){
        smiles <- findSmiles(cpdID)
        mol <- getMolecule(smiles)
        f <- .get.mol2formula(mol)
        return(f@string)
    }
}

#' @export
findCAS <- function(cpdID) {
	if(is.character(cpdID))
		cpdID <- as.numeric(cpdID)
	if(is.null(.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"CAS"])
}

#' @export
findName <- function(cpdID) {
	if(is.character(cpdID))
		cpdID <- as.numeric(cpdID)
	if(is.null(.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Name"])
}

#' @export
findLevel <- function(cpdID, compact=FALSE) {
	if(is.character(cpdID))
		cpdID <- as.numeric(cpdID)
	if(is.null(.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
	if(!exists("compoundList", where=.listEnvEnv$listEnv))
		stop("Compound list must be loaded first.")
    if(compact){
        level <- .listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"]
        if(level %in% c("0","1","1a","1b","1c","2","2a","2b","3","3a")){
            return("standard")
        }
        if(level %in% c("3b","3c","3d","4")){
            return("tentative")
        }
        if(level == "5"){
            return("unknown")
        }
        
        return("tentative")
    }
	return(.listEnvEnv$listEnv$compoundList[which(.listEnvEnv$listEnv$compoundList$ID == cpdID),"Level"])
}

#' Calculate exact mass
#' 
#' Retrieves the exact mass of the uncharged molecule. It works directly from
#' the SMILES and therefore is used in the MassBank workflow
#' (\code{\link{mbWorkflow}}) - there, all properties are calculated from the
#' SMILES code retrieved from the database. (Alternatively, takes also the
#' compound ID as parameter and looks it up.) Calculation relies on Rcdk.
#' 
#' @param cpdID_or_smiles SMILES code or compound ID of the molecule. (Numerics
#' are treated as compound ID).
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only know thing is the m/z
#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions 
#' 			([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).
#' Only needed for retrieval="unknown"
#' @return  Returns the exact mass of the uncharged molecule.
#' @author Michael Stravs
#' @seealso \code{\link{findMz}}
#' @examples
#' 
#' ##
#' findMass("OC[C@@H]1OC(O)[C@@H](O)[C@@@@H](O)[C@@@@H]1O")
#' 
#' @export
findMass <- function(cpdID_or_smiles, retrieval="standard", mode = "pH")
{
    # Must calculate mass manually if no formula is given
    if(retrieval == "unknown"){
        adductProperties <- getAdductProperties(mode, rcdk::get.formula(findFormula(cpdID_or_smiles)))
        allowed_additions <- adductProperties$addition
        mode.charge <- adductProperties$charge
        mass <- getMonoisotopicMass(allowed_additions)
        return(findMz(cpdID_or_smiles, mode=mode, retrieval=retrieval)$mzCenter - mass + mode.charge * .emass)
    }
    
    # Calculate mass from formula
    if(retrieval == "tentative"){
        return(get.formula(findFormula(cpdID_or_smiles, "tentative"))@mass)
    }
    
    # Calculate mass from SMILES
    if(retrieval == "standard"){
        if(!is.numeric(cpdID_or_smiles))
            s <- cpdID_or_smiles
        else
            s <- findSmiles(cpdID_or_smiles)
        mol <- getMolecule(s)
        return(get.exact.mass(mol))
    }
}
sneumann/RMassBank documentation built on Oct. 20, 2020, 3:19 p.m.