R/download.utilities.R

Defines functions findPathways filter.pathways.by.org match.names find.pathways.by.keywords downloadSbgnFile sbgn.gsets load.all.ids.mapping load.id.mapping.list.all generate.cpd.mapping.table get.keggrest.data mapping.ko.to.arbitrary.id.type generate.ko.mapping.list geneannot.map.ko loadMappingTable search.sbgnhub.id.mapping download.mapping.file

Documented in downloadSbgnFile findPathways loadMappingTable sbgn.gsets

#########################################################################################################
# searches for different id mapping file name combinations in local, SBGNview.data, SBGNhub locations
# check local directory for any mapping files, if not found then check SBGNView.data, then SBGNhub
# returns list containing the mapping file name and the location where its found
download.mapping.file <- function(input.type, output.type, species = NULL, 
                                  SBGNview.data.folder = "./SBGNview.tmp.data") { 
  
  # R CMD check will give different sort result if you didn't specify 'method': the
  # default sort method depends on the locale of your system. And R CMD check uses
  # a different locale than my interactive session.  The issue resided in this: R
  # interactively used:LC_COLLATE=en_US.UTF-8; R CMD check used: LC_COLLATE=C;
  # https://stackoverflow.com/questions/42272119/r-cmd-check-fails-devtoolstest-works-fine
  type.pair.name.1 <- paste(sort(c(input.type, output.type), method = "radix", 
                                 decreasing = FALSE), collapse = "_")
  type.pair.name.2 <- paste(sort(c(input.type, output.type), method = "radix", 
                                 decreasing = TRUE), collapse = "_")
  type.pair.name.1.org <- paste(species, type.pair.name.1, sep = "_")
  type.pair.name.2.org <- paste(species, type.pair.name.2, sep = "_")
  try.file.names <- c(type.pair.name.1.org, type.pair.name.2.org, type.pair.name.1, 
                      type.pair.name.2)
  try.file.names <- gsub("[[:space:]]", "", try.file.names) # remove any spaces
  
  # KO_ENTREZID.RData in SBGNhub may not contain the all the information for the speices in it. 
  # For case when input/output is KO/ENTREZID, try.file.names contains KO_ENTREZID,
  # so if species specific file not found, KO_ENTREZID will be downloaded and used instead 
  # of generating the species specific mapping between KO and ENTREZID. Modify 'try.files.names'
  # to search only for species specific mapping b/w KO and ENTREZID so if file not found, 
  # mapping is generated from scratch using KEGGREST.
  if(!is.null(species) & any(c(input.type, output.type) %in% c("KO", "ko")) &
     any(c(input.type, output.type) %in% c("ENTREZID"))) {
    try.file.names <- c(type.pair.name.1.org, type.pair.name.2.org)
  }
  
  location <- ""
  mapping.file.name <- ""
  
  for(try.file.name in try.file.names) { # for each try.file.name -> check local, data, hub if not exists()
    
    # try.file.name already exists in environment
    if(exists(try.file.name)) {
      location <- "exists"
      mapping.file.name <- try.file.name
      break
      
    } else { # check local, SBGNview.data, or hub
      
      file.name.rdata <- paste(try.file.name, ".RData", sep = "")
      # check local
      if(file.exists(paste(SBGNview.data.folder, file.name.rdata, sep = "/"))) {
        location <- "local"
        mapping.file.name <- file.name.rdata
        message("\nLocal mapping table file found: ", mapping.file.name)
        (break)()
      }
      
      # check SBGNview.data
      if.exists.SBGNview.data <- tryCatch(data(list = try.file.name), warning = function(w) "none")
      if(if.exists.SBGNview.data != "none") { # found in SBGNview.data
        location <- "SBGNview.data"
        mapping.file.name <- try.file.name
        message("\n", if.exists.SBGNview.data, " mapping table loaded from SBGNview.data")
        break
      }
      
      # check SBGNhub for .RData file
      if(location == "") {
        mapping.file.name <- search.sbgnhub.id.mapping(try.file.name = file.name.rdata, SBGNview.data.folder)
        if(!is.null(mapping.file.name)) { # found in SBGNhub
          location <- "SBGNhub"
          message(mapping.file.name, " downloaded from SBGNhub")
          message("saved to ", SBGNview.data.folder)
          break
        } 
      }
      
    } # end else block
  } # end for loop
  
  if(location == "" & is.null(mapping.file.name)) { # mapping table not found in local, SBGNview.data, SBGNhub
    location <- "pathview"
    mapping.file.name <- ""
  }
  
  mapping.file.info <- list(location = location, mapping.file.name = mapping.file.name)
  return(mapping.file.info)
}

#########################################################################################################
# used by download.mapping.file function
# search SBGNhub.id.mapping.tables, if file exits download from SBGNhub
search.sbgnhub.id.mapping <- function(try.file.name, SBGNview.data.folder) {
  
  mapping.file.name <- NULL
  if(!exists("SBGNhub.id.mapping.tables")) data(SBGNhub.id.mapping.tables, package="SBGNview")
  if(try.file.name %in% SBGNhub.id.mapping.tables) {
    message("\n", try.file.name, " found in SBGNhub")
    file.url <- paste("https://raw.githubusercontent.com/datapplab/SBGNhub/master/data/id.mapping.unique.pair.name/", 
                      try.file.name, sep = "")
    file.destination <- paste(SBGNview.data.folder, try.file.name, sep = "/")
    download.file(file.url, destfile = file.destination, method = "auto", mode = "wb")
    mapping.file.name <- try.file.name
  }
  return(mapping.file.name)
}

#########################################################################################################
#' Generate ID mapping table from input and output ID types 
#' 
#' This function generates the ID mapping table between input and output ID types. If provided a vector of input IDs (limit.to.ids argument), the function will output mapping table only containing the input IDs. Otherwise, the function will output all IDs of input and output types (restricted to a species if working on gene types and specified  the 'species' parameter).
#' This function will check if a mapping table exists in local folder ('SBGNview.data.folder' argument), SBGNview.data package, and SBGNhub. If no mapping table is found in these locations, a mapping table is generated using Pathview and saved to path specified by 'SBGNview.data.folder' argument.
#' 
#' @param input.type A character string. Gene or compound ID type
#' @param output.type A character string. Gene or compound ID type
#' @param species A character string. Three letter KEGG species code.
#' @param mol.type A character string. Either 'gene' or 'cpd'.
#' @param limit.to.ids Vector. Molecule IDs of 'input.type'.
#' @param SBGNview.data.folder A character string. Default: "./SBGNview.tmp.data". The path to a folder that will hold downloaded ID mapping files and pathway information data files.
#' @return A list containing the mapping table. 
#' @examples 
#'  data(mapped.ids)
#'  entrez.to.pathwayCommons <- loadMappingTable(input.type = 'ENTREZID',
#'                                               output.type = 'pathwayCommons',
#'                                               species = 'hsa',
#'                                               mol.type = 'gene')
#'                              
#' @export

loadMappingTable <- function(input.type = NULL, output.type = NULL, species = NULL, mol.type = NULL, 
                             limit.to.ids = NULL, SBGNview.data.folder = "./SBGNview.tmp.data") { 
  
  # input validation 
  if(is.null(input.type) | is.null(output.type)) {
    stop("Please make sure both input.type and output.type arguments are specified.") 
  }
  if(input.type %in% c("entrez", "eg", "entrezid", "ENTREZ")) input.type = "ENTREZID"
  if(output.type %in% c("entrez", "eg", "entrezid", "ENTREZ")) output.type = "ENTREZID"
  # changed mapping file names from CompondName to compound.name and kegg.ligand to kegg
  # we handle if input/output is CompoundName/kegg.ligand/KEGG
  if(input.type == "CompoundName") input.type <- "compound.name"
  if(input.type %in% c("kegg.ligand", "KEGG")) input.type <- "kegg"
  if(output.type == "CompoundName") output.type <- "compound.name"
  if(output.type %in% c("kegg.ligand", "KEGG")) output.type <- "kegg"
  
  if(input.type == output.type) {
    stop("Input type and output types are the same!")
  }
  species = gsub("Hs","hsa",species)
  
  if(is.null(mol.type) | !mol.type %in% c("gene", "cpd")) { 
    stop("'mol.type' argument must be 'gene' or 'cpd'")
  }
  
  if(!file.exists(SBGNview.data.folder)) { 
    dir.create(SBGNview.data.folder) 
  }
  
  # 'download.mapping.file checks' local, SBGNview.data, SBGNhub for mapping file
  # location values ("local", "SBGNview.data", "SBGNhub", "pathview")
  # local and SBGNhub run location, file names have .RData extension
  mapping.file.info <- download.mapping.file(input.type = input.type,
                                             output.type = output.type,
                                             species = species,
                                             SBGNview.data.folder = SBGNview.data.folder)
  
  # mapping table object already exists in environment
  if(mapping.file.info$location == "exists") { 
    message("\n", mapping.file.info$mapping.file.name, " mapping table exists in environment")
    mapping.list <- get(mapping.file.info$mapping.file.name)
    
  } else if(mapping.file.info$location == "local" || mapping.file.info$location == "SBGNhub") {
    
    # read local file or file downloaded from SBGNhub
    path.to.local.file <- paste(SBGNview.data.folder, mapping.file.info$mapping.file.name, sep = "/")
    var.name <- load(file = path.to.local.file)
    mapping.list <- get(var.name) # mapping.list returned
    message(mapping.file.info$mapping.file.name, " loaded from local")
    
  } else if (mapping.file.info$location == "SBGNview.data") {
    mapping.list <- get(mapping.file.info$mapping.file.name) # get data loaded from SBGNview.data
    
  } else {  # use pathview
    message("\nGenerating mapping using Pathview")
    if(mol.type == "gene") {
      # pathway.id used for heterogeneous purposes and mapping gene to pathway.
      # pathwayCommons and metacyc.SBGN are databases mentioned in pathways.stats
      # unique(pathways.info$macromolecule.ID.type) returns "pathwayCommons" "metacyc.SBGN"  "ENZYME"
      # ENZYME not in if condition below b/c it can be mapped directly
      if(any(c(input.type,output.type) %in% c("pathwayCommons","metacyc.SBGN","pathway.id")) &
         !any(c(input.type,output.type) %in% c("KO")) ) { # if input and output don't contain KO
        
        # load mapping table for mapping KO to glyph id
        ko.to.glyph.id <- loadMappingTable(input.type = output.type,
                                           output.type = "KO",
                                           mol.type = "gene",
                                           species = species,
                                           SBGNview.data.folder = SBGNview.data.folder)
        # load mapping table for mapping user input to KO id
        input.to.ko <- loadMappingTable(input.type = input.type,
                                        output.type = "KO",
                                        mol.type = "gene",
                                        limit.to.ids = limit.to.ids,
                                        species = species,
                                        SBGNview.data.folder = SBGNview.data.folder)
        input.to.glyph.id <- merge(input.to.ko, ko.to.glyph.id, all = FALSE)
        mapping.list <- input.to.glyph.id[,c(input.type,output.type)]
        
      } else {
        message("\nID mapping not pre-generated")
        # if(is.null(limit.to.ids)){
        #   stop("\nMust provide input IDs to 'limit.to.ids' argument when using pathview mapping!")
        # }
        mapping.list <- geneannot.map.ko(in.ids = limit.to.ids,
                                         in.type = input.type,
                                         out.type = output.type,
                                         species = species,
                                         unique.map= FALSE,
                                         SBGNview.data.folder = SBGNview.data.folder)
        if(is.vector(mapping.list)) {
          mapping.list <- as.matrix(t(mapping.list))
        }
      }
      
    } else if(mol.type == "cpd") {
      mapping.list <- generate.cpd.mapping.table(input.type = input.type, output.type = output.type,
                                                 limit.to.ids = limit.to.ids)
    } 
    
  } # end else use pathview
  
  # if mapping.list contains a species column, filter by value of 'species' argument
  if(!identical(species, character(0))) {
    if("species" %in% colnames(mapping.list) & !is.null(mapping.list)) { 
      
      message("Filtering mapping list by species: '", species, "'")
      if(!exists("korg")) data(korg, package="pathview")
      # species value is KEGG code based on 'species' argument documentation
      if(tolower(species) %in% korg[,3]) { 
        message("Species kegg code in 'korg' dataset")
        ridx <- match(species, korg[, 3]) %% nrow(korg)
        species.sci.name <- tolower(korg[ridx, 4])   # get scientific.name for input species
        
        if(any(mapping.list[,"species"] %in% species) ) { # check kegg code in species column
          mapping.list <- mapping.list[mapping.list[,"species"] %in% species, c(input.type,output.type)]
        } else if (any(tolower(mapping.list[,"species"]) %in% species.sci.name) ) {
          mapping.list <- mapping.list[tolower(mapping.list[,"species"]) %in% species.sci.name, c(input.type,output.type)]
          message("Filtered mapping list by '", species.sci.name, "'")
        } else {
          message("Was not able to filter mapping.list by ", species,  " or ", species.sci.name, ".")
        }
      } else {
        stop(species, " kegg code not in 'korg' dataset or incorrect kegg code.")
      }
      
    } # end if: filter 'species' column
  } # end if !is.null(species)
  
  if(!is.matrix(mapping.list)) mapping.list <- as.matrix(mapping.list)
  return(mapping.list)
}

#########################################################################################################
# uses generate.ko.mapping.list to generate mapping table from scratch using KEGGREST
# otherwise will use pathview
geneannot.map.ko <- function(in.ids = NULL, in.type, out.type, species = "hsa", unique.map, 
                             SBGNview.data.folder = "./SBGNview.tmp.data") {
  # pathview's geneannot.map can't map KO, so here included KO mapping
  if (!is.null(in.ids)) {
    in.ids <- gsub("\"", "", in.ids)
  }
  
  message("Mapping: ", in.type, ", ", out.type, "\n")
  
  if (any(c(in.type, out.type) %in% c("KO", "ko"))) {
    # if input/output KO/ENTREZID combo, set out.type to KO and in.type to ENTREZ
    # otherwise in.type and out.type order matter. if in.type = KO, generate.ko.mapping.list calls mapping.ko.to.arbitrary.id.type
    if(any(c(in.type, out.type) %in% c("ENTREZID"))) {
      in.type <- "ENTREZID"
      out.type <- "KO" 
    }
    
    message("Generating mapping list\n")
    id.map <- generate.ko.mapping.list(in.type = in.type, out.type = out.type, 
                                       species = species, in.ids = in.ids)
    message("\nGenerated mapping list using KEGGREST")
    message("Saving mapping list to current working directory")
    
    file.name <- paste(species, toupper(in.type), toupper(out.type), sep = "_")
    file.save.path <- paste(SBGNview.data.folder, paste(file.name, ".RData", sep = ""), sep = "/")
    assign(file.name, id.map)
    save(list = file.name, file = file.save.path)
    message("\nSaved generated mapping table at: ", file.save.path)
    
  } else { # input/output not KO
    if (species == "mmu") {
      species <- "mouse"
    }
    if (is.null(in.ids)) {
      stop("\nMust provide input IDs when using pathview mapping!")
    }
    id.map <- pathview::geneannot.map(in.ids = in.ids, in.type = in.type, out.type = out.type,
                                      org = species, unique.map = unique.map)
  }
  return(id.map)
}

#########################################################################################################
### generate mapping list on the fly for mapping between KO and other IDs in gene.idtype.list
# Pseudocode for this function:
# input = in.type, out.type ("ko"), species, in.ids
#   if species not in bods 
#       if in.type = entrez id
#           if can't be map to entrez -> stop
#           if "kegg.geneid" == "ncbi.geneid" for species 
#               no need of 2nd mapping table. Map from KEGG ID to KO using keggLink
#           else use two mapping tables 
#                entrez id to kegg id and kegg id to ko and merge the two lists for (entrez, ko) table
#       else 
#           if between KO and ncbiprotinid / uniprot, do mapping
#           else stop(can't map)
#   else (species in bods) 
#       if in.type == entrez
#           map to entrez similar to above
#       else if in.type = KO. map ko to type in gene.id.type.list for bods species
#       else use pathview::id2eg to map to entrezid
#           if "kegg.geneid" == "ncbi.geneid" for species 
#               map directly from KEGG ID to KO
#               merge with id2eg map to get (in.type, ko) table
#           else use 2nd mapping table
#               map from entrez id to keggid merge with id2eg map list
#               map from kegg id to ko and merge with first mereged list
generate.ko.mapping.list <- function(in.type, out.type, species, in.ids = NULL) {
  
  in.type <- tolower(in.type)
  out.type <- tolower(out.type)
  species <- tolower(species)
  
  # korg cols:: 3 = kegg.code; 6 = entrez.gnodes; 7 = kegg.geneid; 8 = ncbi.geneid
  if(!exists("korg")) data(korg, package="pathview")
  if(!exists("bods")) data(bods, package="pathview")
  
  if(!species %in% korg[, 3] & !species %in% bods[, 3]){ # stop if species is NOT in korg or bods
    stop("Incorrect KEGG species code!")
  } 
  
  if(!species %in% bods[, 3]) { # species not in bods
    # get species row index from korg
    ridx <- match(species, korg[, 3]) %% nrow(korg)
    message("'", species, "'", " species in korg dataset from Pathview")
    
    if(any(in.type %in% c("entrez", "eg", "entrezid"))){ # in.type == entrez
      
      if(is.na(korg[ridx, 8])){ # if ncbi.geneid == NA, can't map to entrezid
        stop("Can't map between ENTREZID and KO for ", species)
      }
      if(korg[ridx, 6] == "1"){   # "kegg.geneid" == "ncbi.geneid" (entrez.gnodes == 1)
        message("Mapping directly from ENTREZID to KO")
        mapping.list <- get.keggrest.data(tar = out.type,  src = species, 
                                          link.or.conv = "link", src.is.species = T)
        colnames(mapping.list) <- c("ENTREZID", "KO")
        
      } else { # kegg ID is NOT entrez id. entrez.gnodes == 0. need 2 mapping tables
        
        message("Mapping from ENTREZID to KEGG ID") # ncbi-geneid to kegg id
        conv.list <- get.keggrest.data(tar = species, src = "ncbi-geneid",
                                       link.or.conv = "conv", tar.is.species = T)
        
        message("Mapping from KEGG ID to KO") # kegg id to ko
        link.list <- get.keggrest.data(tar = "ko", src = species, 
                                       link.or.conv = "link", src.is.species = T)
        
        message("Merging mapping lists")
        merge.list <- merge(conv.list, link.list) # merge list. cols = keggid, ncbi-geneid, ko
        mapping.list <- merge.list[,2:3] # take only ncbi-geneid, ko
      }
    } else { # in.type is NOT entrez and species NOT in bods
      # map from in.type to kegg gene id =>  ko
      ## if input/output is ncbi-proteinid or uniprot for species in korg
      if(any(c(in.type, out.type) %in% c("ko")) & 
         any(c(in.type, out.type) %in% c("ncbi-proteinid", "uniprot"))) {
        in.type <- setdiff(c(in.type, out.type), "ko")
        out.type <- "ko"
        
        message("Mapping from ", toupper(in.type), " to KEGG ID") # in.type to kegg id
        conv.list <- get.keggrest.data(tar = species, src = in.type, 
                                       link.or.conv = "conv", tar.is.species = T)
        
        message("Mapping from KEGG ID to KO") # kegg id to ko
        link.list <- get.keggrest.data(tar = "ko", src = species, 
                                       link.or.conv = "link", src.is.species = T)
        
        message("Merging mapping lists")
        merge.list <- merge(conv.list, link.list) # merge mapping lists
        mapping.list <- merge.list[,2:3]
        
      } else {
        stop("This mapping cannot be done")
      }
    }
    
  } else { #### species in bods
    
    message("'", species, "'", " species in bods dataset from Pathview")
    # get species info from korg
    ridx <- match(species, korg[, 3]) %% nrow(korg)
    
    if(any(in.type %in% c("entrez", "eg", "entrezid"))) { # input type is entrez for bods species
      
      message("ID mapping from ENTREZID to KO for bods species")
      
      if(korg[ridx, 6] == "1"){   # "kegg.geneid" == "ncbi.geneid". entrez.gnodes == 1
        # no need of 2nd mapping table
        message("Mapping directly from ENTREZID to KO for bods species")
        mapping.list <- get.keggrest.data(tar = out.type, src = species,
                                          link.or.conv = "link", src.is.species = T)
        colnames(mapping.list) <- c("ENTREZID", "KO")
        
      } else { # kegg ID is NOT entrez id. entrez.gnodes == 0. need 2 mapping tables
        
        message("Mapping from ENTREZID to KEGG ID for bods species") # ncbi-geneid to kegg id
        conv.list <- get.keggrest.data(tar = species, src = "ncbi-geneid",
                                       link.or.conv = "conv", tar.is.species = T)
        
        message("Mapping from KEGG ID to KO for bods species") # kegg id to ko
        link.list <- get.keggrest.data(tar = "ko", src = species,
                                       link.or.conv = "link", src.is.species = T)
        
        message("Merging mapping lists")
        merge.list <- merge(conv.list, link.list) # merge list. cols = keggid, ncbi-geneid, ko
        mapping.list <- merge.list[,2:3] # take only ncbi-geneid, ko
        
      } # end if condition for input type is entrez for species in bods
      
    } else if (in.type == "ko") { ### if input is ko, and species in bods
      # map from KO to output type in gene.idtype.list for bods species
      # input.ko.ids required
      message("KO to ", toupper(out.type))
      mapping.list <- mapping.ko.to.arbitrary.id.type(input.ko.ids = in.ids, 
                                                      output.type = toupper(out.type),
                                                      species = species, 
                                                      ridx = ridx)
      
    } else { # input type not entrez. other input type
      # map from in.type to ENTREZID using pathview
      message("Mapping from ", toupper(in.type), " to ENTREZID for '", 
              species, "' using pathview::id2eg")
      
      if(is.null(in.ids)){
        stop("Need vector of input ids to use pathview::id2eg")
      }
      
      in.to.eg <- pathview::id2eg(ids = in.ids, category = in.type, org = species) # in.type, entrez
      
      if(korg[ridx, 6] == "1") { # check if KEGG id == ENTREZID in korg for bods species
        
        message("Mapping directly from ENTREZID to KO for bods species") 
        map.list <- get.keggrest.data(tar = out.type, src = species,  
                                      link.or.conv = "link", src.is.species = T) # kegg, ko
        colnames(map.list) <- c("ENTREZID", "KO")
        
        message("Merging mapping lists for bods species")
        merge.list <- merge(in.to.eg, map.list) # merge in.type, entrezid with keggid, ko
        mapping.list <- merge.list[,2:3] 
        
      } else { # KEGG id != ENTREZID. need 2nd mapping table
        message("Mapping from ENTREZID to KEGG ID for bods species") # ncbi-geneid to kegg id
        conv.list <- get.keggrest.data(tar = species, src = "ncbi-geneid",
                                       link.or.conv = "conv", tar.is.species = T)
        
        # in.type, entrezid merge with entrezid, keggid
        merge.list.1 <- merge(in.to.eg, conv.list)
        map.list.1 <- merge.list.1[,2:3]  # in.type, keggid
        
        message("Mapping from KEGG ID to KO for bods species") # kegg id to ko
        link.list <- get.keggrest.data(tar = "ko", src = species,
                                       link.or.conv = "link", src.is.species = T)
        
        # merge in.type, keggid with  keggid, ko
        message("Merging mapping lists for bods species")
        merge.list.2 <- merge(map.list.1, link.list) 
        mapping.list <- merge.list.2[,2:3] # in.type, ko
      }
      
    } # end else in.type != entrez
    
  } # end else species in bods
  
  return(mapping.list)
}

#########################################################################################################
# mapping from input type KO to output tpye (in gene.idtype.list) for species in bods dataset
# using keggrest and pathview::eg2id. Need vector of input of KO ids
mapping.ko.to.arbitrary.id.type <- function(input.ko.ids = NULL, output.type, species, ridx) {
  
  if(is.null(input.ko.ids)){
    stop("Need vector of input KO IDs")
  }
  
  if(!toupper(output.type) %in% gene.idtype.list) {
    stop("Output type not in 'gene.id.type.list'")
  }
  
  if(korg[ridx, 6] == "1") {   # default "kegg.geneid" is "ncbi.geneid". entrez.gnodes == 1
    
    message("Mapping KO to ENTREZID for bods species") 
    ko.to.entrez <- get.keggrest.data(tar = species, src = "ko",
                                      link.or.conv = "link", tar.is.species = T) # ko, keggid/entrez 
    colnames(ko.to.entrez) <- c("KO", "ENTREZID")
    
  } else { # kegg Id is not deault. so map KO to KEGGID, KEGGID to ncbiID, merge
    
    message("Mapping KO to KEGGID")
    ko.to.kegg <- get.keggrest.data(tar = species, src = "ko",
                                    link.or.conv = "link", tar.is.species = T)
    message("Mapping KEGGID to NCBI-GENEID")
    kegg.to.ncbi <- get.keggrest.data(tar = "ncbi-geneid", src = species,
                                      link.or.conv = "conv", src.is.species = T)
    message("Merging mapping lists")
    ko.to.ncbi <- merge(ko.to.kegg, kegg.to.ncbi)
    ko.to.entrez <- ko.to.ncbi[, 2:3]
    colnames(ko.to.entrez) <- c("KO", "ENTREZID")
  }
  
  # match input KO ids with entrez IDS in generated mapping table
  input.ko.ids <- unique(input.ko.ids)
  ko.to.entrez <- as.matrix(ko.to.entrez)
  filterd.ko.to.entrez <- subset(ko.to.entrez, ko.to.entrez[,1] %in% input.ko.ids)
  
  # use pathview::eg2id to map entrez to output type and merge to get KO, SYMBOL mapping list
  filterd.ko.to.entrez <- as.matrix(filterd.ko.to.entrez)
  entrez.to.output <- pathview::eg2id(eg = filterd.ko.to.entrez[,2], 
                                      category = toupper(output.type),
                                      org = species)
  mapping.list <- merge(filterd.ko.to.entrez, entrez.to.output)
  mapping.list <- mapping.list[, 2:3]
  
  return(mapping.list)
}

#########################################################################################################
# get data using keggLink and keggConv to get mapping data and format to a list for generate.ko.mapping.list
get.keggrest.data <- function(tar, src, link.or.conv, tar.is.species = F, src.is.species = F) {
  # tar and src should be lower case
  if(link.or.conv == "link"){ # keggLink
    kegg.data <- KEGGREST::keggLink(target = tar, source = src)
  } else { # keggConv
    kegg.data <- KEGGREST::keggConv(target = tar, source = src)
  }
  
  ids1 <- gsub("^.+:", "", names(kegg.data))
  ids2 <- gsub("^.+:", "", kegg.data)
  map.list <- cbind(ids1, ids2)
  rownames(map.list) <- NULL
  
  if(src == "ncbi-geneid") { src <- "ENTREZID" }
  if(src.is.species == TRUE) {src <- paste("KEGGID", src, sep = "-")}
  if(tar.is.species == TRUE) {tar <- paste("KEGGID", tar, sep = "-")}
  
  colnames(map.list) <- c(toupper(src), toupper(tar))
  
  return(map.list)
}

#########################################################################################################
# used in loadMappingTable function
# using chebi as bridge ID to map from IDs in data(cpd.simtypes) from pathview to 
# glyph IDs of pathwayCommons, pathway.id, or metacyc.SBGN
# if input type and output type in data(cpd.simtypes), use pathview::cpdidmap
# else map to kegg using pathview::cpd2kegg, kegg to chebi, chebi to glyph ID
generate.cpd.mapping.table <- function(input.type, output.type, limit.to.ids) {

  if(!exists("cpd.simtypes")) data("cpd.simtypes", package = "pathview")
  # input/output validation
  valid.in.out.types <- c(cpd.simtypes, "KEGG", "kegg", "pathwayCommons", "pathway.id", "metacyc.SBGN", "compound.name")
  if(!input.type %in% valid.in.out.types | !output.type %in% valid.in.out.types) {
    stop("'input.type' needs to be one of values in data(cpd.simtypes) or 'KEGG' if input.type is KEGG COMPOUND/DRUG accession.
    'output.type' can be one of values in data(cpd.simtypes) or one of 'pathwayCommons','pathway.id', 'metacyc.SBGN', 'compound.name'.")
  }
  if(is.null(limit.to.ids)) {
    stop("\nMust provide input IDs to 'limit.to.ids' argument when using pathview mapping!")
  }
  if(input.type %in% c("pathwayCommons", "pathway.id", "metacyc.SBGN", "KEGG", "kegg") & output.type %in% cpd.simtypes) {
    temp <- input.type
    input.type <- output.type
    output.type <- temp
  }
  if(input.type %in% c("pathwayCommons", "pathway.id", "metacyc.SBGN") & output.type %in% c("KEGG", "kegg")) {
    temp <- input.type
    input.type <- output.type
    output.type <- temp
  }
  # map between IDs in cpd.simtypes
  if(input.type %in% cpd.simtypes & output.type %in% c(cpd.simtypes, "KEGG", "kegg")) {
    message("Using pathview::cpdidmap to map ", input.type, " and ", output.type)
    mapping.list <- pathview::cpdidmap(in.ids = limit.to.ids, in.type = input.type,
                                       out.type = output.type)
  } else { # map to glyph ID type
    input.cpd.to.kegg <- limit.to.ids
    # if input.type not native KEGG, map to kegg ID first
    if(!input.type %in% c("KEGG COMPOUND accession", "KEGG DRUG accession", "KEGG", "kegg")) {
      message("Using pathview::cpd2kegg to map ", input.type, " to KEGG")
      input.cpd.to.kegg <- pathview::cpd2kegg(in.ids = limit.to.ids, in.type = input.type)
      colnames(input.cpd.to.kegg) <- c(input.type, "kegg")
    } else {
      # native kegg
      message("Input is a native KEGG compound ID type")
      input.cpd.to.kegg <- matrix(input.cpd.to.kegg)
      colnames(input.cpd.to.kegg) <- c("kegg")
      input.type <- "kegg"
    }
    # chebi_kegg.ligand doesn't contain DRUG and COMPOUND accession data
    # using chebi_kegg that contains both accessions data
    message("Mapping ", input.type, " to ", output.type)
    if(!exists("chebi_kegg")) data("chebi_kegg", package = "SBGNview.data")
    chebi.to.kegg <- as.matrix(get("chebi_kegg"))
    
    input.type.to.chebi <- merge(input.cpd.to.kegg, chebi.to.kegg)
    input.type.to.chebi <- input.type.to.chebi[ , c(input.type, "chebi")]
    
    chebi.to.ouput.type.map.table <- paste("chebi_", output.type, sep = "")
    if(!exists(chebi.to.ouput.type.map.table)) { 
      data(list = chebi.to.ouput.type.map.table, package = "SBGNview.data") 
    }
    chebi.to.output.type <- as.matrix(get(chebi.to.ouput.type.map.table))
    
    mapping.list <- merge(input.type.to.chebi, chebi.to.output.type)
    mapping.list <- mapping.list[, c(input.type, output.type)]
    
    if(length(mapping.list) == 0 | dim(mapping.list)[1] == 0) {
      stop("Mapping from ", input.type, " to ", output.type, " did not work. Please check input IDs type and try again.")
    }
  } # end else map to glyph ID
  message("Finished mapping from ", input.type, " to ", output.type)
  
  return(mapping.list)
}

#########################################################################################################
# find all mapping tables for gene and compound id types and combine into a list
load.id.mapping.list.all <- function(SBGN.file.cpd.id.type = NULL, SBGN.file.gene.id.type = NULL, 
                                     output.gene.id.type = NULL, output.cpd.id.type = NULL, species = NULL, SBGNview.data.folder = "./SBGNview.tmp.data") {
  idmapping.all.list <- list()
  cpd.id.mapping.list <- list()
  gene.id.mapping.list <- list()
  if (!is.null(output.cpd.id.type)) {
    if (SBGN.file.cpd.id.type != output.cpd.id.type) {
      input.cpd.type <- SBGN.file.cpd.id.type
      cpd.mapping.table <-  loadMappingTable(output.type = output.cpd.id.type, 
                                             input.type = input.cpd.type, mol.type = "cpd", species = species, 
                                             SBGNview.data.folder = SBGNview.data.folder)
      type.pair.name <- paste(sort(c(input.cpd.type, output.cpd.id.type), method = "radix",
                                   decreasing = TRUE), collapse="_")
      cpd.id.mapping.list[["cpd"]] <- list()
      cpd.id.mapping.list[["cpd"]][[type.pair.name]] <- cpd.mapping.table
    }
  }
  
  if (!is.null(output.gene.id.type)) {
    if (SBGN.file.gene.id.type != output.gene.id.type) {
      input.gene.type <- SBGN.file.gene.id.type
      gene.mapping.table <- loadMappingTable(output.type = output.gene.id.type, 
                                             input.type = input.gene.type, mol.type = "gene", species = species, 
                                             SBGNview.data.folder = SBGNview.data.folder)
      type.pair.name <- paste(sort(c(input.gene.type, output.gene.id.type), method = "radix",
                                   decreasing = TRUE), collapse="_")
      gene.id.mapping.list[["gene"]] <- list()
      gene.id.mapping.list[["gene"]][[type.pair.name]] <- gene.mapping.table
    }
  }
  id.mapping.all.list <- c(cpd.id.mapping.list, gene.id.mapping.list)
  return(id.mapping.all.list)
}

#########################################################################################################
# get all ID mapping tables for gene and compound ID types
load.all.ids.mapping <- function(database, all.pairs.id.mapping.list, species, output.gene.id.type, 
                                 output.cpd.id.type, SBGNview.data.folder = "./SBGNview.tmp.data") {
  
  if (database == "MetaCrop") {
    sbgn.id.attr <- "label"
    SBGN.file.gene.id.type <- "ENZYME"
    SBGN.file.cpd.id.type <- "compound.name"  # "CompoundName"
  } else if (database == "MetaCyc") {
    sbgn.id.attr <- "id"
    SBGN.file.gene.id.type <- "metacyc.SBGN"
    SBGN.file.cpd.id.type <- "metacyc.SBGN"
  } else if (database == "pathwayCommons") {
    sbgn.id.attr <- "id"
    SBGN.file.gene.id.type <- "pathwayCommons"
    SBGN.file.cpd.id.type <- "pathwayCommons"
  }
  
  if (is.na(output.gene.id.type)) {
    output.gene.id.type.use <- SBGN.file.gene.id.type
  } else {
    output.gene.id.type.use <- output.gene.id.type
  }
  if (is.na(output.cpd.id.type)) {
    output.cpd.id.type.use <- SBGN.file.cpd.id.type
  } else {
    output.cpd.id.type.use <- output.cpd.id.type
  }
  mapped.id.names <- paste(c(SBGN.file.cpd.id.type, SBGN.file.gene.id.type, output.gene.id.type.use, 
                             output.cpd.id.type.use), collapse = "_")
  if (mapped.id.names %in% names(all.pairs.id.mapping.list)) {
    id.mapping.all.list <- all.pairs.id.mapping.list[[mapped.id.names]]
  } else {
    id.mapping.all.list <- load.id.mapping.list.all(SBGN.file.cpd.id.type, SBGN.file.gene.id.type, 
                                                    output.gene.id.type.use, output.cpd.id.type.use, species = species, SBGNview.data.folder = SBGNview.data.folder)
    all.pairs.id.mapping.list[[mapped.id.names]] <- id.mapping.all.list
  }
  return(list(all.pairs.id.mapping.list = all.pairs.id.mapping.list, id.mapping.all.list = id.mapping.all.list, 
              output.cpd.id.type.use = output.cpd.id.type.use, output.gene.id.type.use = output.gene.id.type.use, 
              SBGN.file.cpd.id.type = SBGN.file.cpd.id.type, SBGN.file.gene.id.type = SBGN.file.gene.id.type))
}

#########################################################################################################
#' Retrieve gene list or compound list from collected databases
#' 
#' @param id.type A character string. Default: "ENTREZID". The ID type of output gene list. One of the supported types in \code{data('mapped.ids')}
#' @param mol.type A character string. Default: "gene". One of 'gene' or 'cpd'
#' @param species A character string. Default: "hsa". The three letter species code used by KEGG. E.g. 'hsa','mmu'
#' @param database A character string. Default: "pathwayCommons". The database where gene list will be extracted. Acceptable values: 'MetaCyc', 'pathwayCommons', 'MetaCrop'. The value is case in-sensitive.
#' @param output.pathway.name Logical. Default: T. If set to 'TRUE', the names of returned list are in the format: 'pathway.id::pathway.name'. If set to 'FALSE', the format is 'pahtway.id'
#' @param combine.duplicated.set Logical. Default: T. Some pathways have the same geneset. If this parameter is set to 'TRUE', the output list will combine pathways that have the same gene set. The name in the list will be pathway names concatinated with '||'
#' @param truncate.name.length Integer. Default: 50. The pathway names will be truncated to at most that length. 
#' @param SBGNview.data.folder A character string. Default: "./SBGNview.tmp.data". The path to a folder that will hold downloaded ID mapping files and pathway information data files.
#' @return A list. Each element is a genelist of a pathway.
#' @examples 
#' data(pathways.info)
#' mol.list <- sbgn.gsets(id.type = 'ENTREZID',
#'                        species = 'hsa',
#'                        database = 'pathwayCommons')
#'   
#' @export

sbgn.gsets <- function(id.type = "ENTREZID", mol.type = "gene", species = "hsa", database = "pathwayCommons", 
                       output.pathway.name = TRUE, combine.duplicated.set = TRUE, 
                       truncate.name.length = 50, SBGNview.data.folder = "./SBGNview.tmp.data") {
  
  if(id.type %in% c("ENTREZID", "eg", "entrez", "entrezid")) id.type <- "ENTREZID"
  if(id.type == "CompoundName") id.type <- "compound.name"
  if(id.type %in% c("kegg.ligand", "KEGG")) id.type <- "kegg"
  
  if (tolower(database) == "metacrop") {
    if (mol.type == "gene") {
      id.in.pathway <- "ENZYME"
    } else {
      id.in.pathway <- "compound.name" # "CompoundName"
    }
    output.pathways <- subset(pathways.info, database == "MetaCrop", select = "pathway.id")
  } else if (tolower(database) %in% c("pathwaycommons", "metacyc")) {
    if (mol.type == "gene") {
      id.in.pathway <- "KO"
    } else {
      id.in.pathway <- "chebi"
    }
    output.pathways <- subset(pathways.info, database != "MetaCrop", select = "pathway.id")
  } else { # if wrong value for database
    stop("'database' argument value is incorrect. Acceptable values are: 'MetaCyc', 'pathwayCommons', 'MetaCrop'")
  }
  # metacrop initial list is using enzyme
  ref.to.pathway <- loadMappingTable(input.type = id.in.pathway, output.type = "pathway.id", 
                                   mol.type = mol.type, species = species, SBGNview.data.folder = SBGNview.data.folder)

  if (id.type == id.in.pathway) {
    out.id.to.pathway <- ref.to.pathway
    # change KO to output id
  } else {
    message("Mapping ", id.in.pathway, " to ", id.type)
    
    out.id.type.to.ref <- loadMappingTable(input.type = id.in.pathway, output.type = id.type, 
                                           mol.type = mol.type, limit.to.ids = ref.to.pathway[, id.in.pathway], 
                                           species = species, SBGNview.data.folder = SBGNview.data.folder)
    
    out.id.type.to.ref <- out.id.type.to.ref 
    # merge KO to pathway and KO to output id
    out.id.to.pathway <- merge(out.id.type.to.ref, ref.to.pathway, all.x = TRUE)
    out.id.to.pathway <- out.id.to.pathway[, c(id.type, "pathway.id")]
    out.id.to.pathway <- unique(out.id.to.pathway)
    out.id.to.pathway <- out.id.to.pathway[!is.na(out.id.to.pathway[, 2]), ]
  }
  out.id.to.pathway <- unique(out.id.to.pathway)
  out.id.to.pathway <- out.id.to.pathway[out.id.to.pathway[, "pathway.id"] %in% 
                                           output.pathways$pathway.id, ]
  out.id.to.pathway <- out.id.to.pathway[out.id.to.pathway[, id.type] != 
                                           "", ]
  out.id.to.pathway <- split(as.character(out.id.to.pathway[, id.type]), 
                             out.id.to.pathway[, "pathway.id"])
  out.id.to.pathway <- out.id.to.pathway[!is.na(names(out.id.to.pathway))]
  
  if (output.pathway.name) {
    # merge pathway names
    pathway.id.to.name <- pathways.info[, c("pathway.id", "pathway.name")]
    row.names(pathway.id.to.name) <- pathway.id.to.name[, "pathway.id"]
    pathway.ids <- paste(pathway.id.to.name[, "pathway.id"], pathway.id.to.name[, 
                                                                                "pathway.name"], sep = "::")
    names(pathway.ids) <- pathway.id.to.name[, "pathway.id"]
    names(out.id.to.pathway) <- pathway.ids[names(out.id.to.pathway)]
  }
  
  if (combine.duplicated.set) {
    sets <- lapply(out.id.to.pathway, function(ids) {
      ids.str <- paste(sort(ids), collapse = "||")
    })
    sets <- unlist(sets)
    pathways.same.set <- tapply(names(sets), as.factor(sets), function(pathways) {
      pathways.joint <- paste(sort(pathways), collapse = "||")
    })
    out.id.to.pathway <- as.list(names(pathways.same.set))
    names(out.id.to.pathway) <- pathways.same.set
    out.id.to.pathway <- lapply(out.id.to.pathway, function(ids) {
      strsplit(ids, "\\|\\|")[[1]]
    })
  }
  names(out.id.to.pathway) <- substr(names(out.id.to.pathway), 1, truncate.name.length)
  
  sorted.names <- sort(names(out.id.to.pathway), method = "radix", decreasing = FALSE)
  out.id.to.pathway <- out.id.to.pathway[sorted.names]
  return(out.id.to.pathway)
}

#########################################################################################################
#' Download pre-generated SBGN-ML file from GitHub
#' 
#' This function will download a SBGN-ML file from our pre-collected SBGN-ML files given the 'pathway.id' argument.
#' 
#' @param pathway.id A character string. The ID of pathway. For accepted pathway IDs, please check \code{data('pathways.info')}. IDs are in column 'pathway.id' (pathways.info[,'pathway.id'])
#' @param download.folder A character string. Default: "./". The output folder to store created SBGN-ML files.
#' @return A vector of character strings. The path to the created SBGN-ML files.
#' @examples 
#' data("pathways.info")
#' data("sbgn.xmls")
#' input.sbgn <- downloadSbgnFile(pathway.id = pathways.info[1,'pathway.id'],
#'                               download.folder = './')
#' @export

downloadSbgnFile <- function(pathway.id, download.folder = "./") {
  
  if (!file.exists(download.folder)) {
    dir.create(download.folder)
  }
  if (any(pathway.id %in% c("AF_Reference_Card.sbgn", "PD_Reference_Card.sbgn", 
                            "ER_Reference_Card.sbgn"))) {
    sbgn.file.names <- pathway.id
  } else {
    sbgn.file.names <- pathways.info[pathways.info[, "pathway.id"] %in% pathway.id, 
                                     "file.name"]
  }
  if (length(sbgn.file.names) == 0) {
    print("pathway.id")
    stop("Only pathway IDs in pathways.info[,\"pathway.id\"] are supported!!!\n ")
  }
  database.name <- pathways.info[pathways.info[, "pathway.id"] %in% pathway.id, 
                                 "database"]
  output.files <- c()
  for (i in seq_len(length.out = length(sbgn.file.names))) {
    sbgn.file.name <- gsub("\"", "", sbgn.file.names[i])
    output.file <- paste(download.folder, "/", sbgn.file.name, sep = "")
    if (!file.exists(output.file)) {
      if (sbgn.file.name %in% names(sbgn.xmls)) {
        write(sbgn.xmls[[sbgn.file.name]], file = output.file)
      } else {
        stop(sbgn.file.name, " is not included in SBGNview.data package!")
      }
      
    } else {
      message("SBGN file exists:")
      message(output.file)
    }
    output.files <- c(output.files, (as.character(output.file)))
  }
  output.files
  return(output.files)
}

#########################################################################################################
# find pathways in pathways.info based on input keywords and keyword matching logic
find.pathways.by.keywords <- function(keywords, keyword.type, keywords.logic, mol.name.match, 
                                      SBGNview.data.folder = "./SBGNview.tmp.data/") {
  if (is.null(keywords)) {
    pathways <- pathways.info[, c("pathway.id", "pathway.name", "sub.database", "database")]
  } else if (keyword.type == "pathway.name") {
    keywords <- tolower(keywords)
    pathways.info[, "pathway.name"] <- tolower(pathways.info[, "pathway.name"])
    
    if.selected <- grepl(pattern = keywords[1], ignore.case = TRUE, pathways.info$pathway.name)
    if (length(keywords) > 1) {
      for (i in 2:length(keywords)) {
        if (keywords.logic == "and") {
          if.selected <- if.selected & grepl(pattern = keywords[i], ignore.case = TRUE, 
                                             pathways.info$pathway.name)
        } else if (keywords.logic == "or") {
          if.selected <- if.selected | grepl(pattern = keywords[i], ignore.case = TRUE, 
                                             pathways.info$pathway.name)
        } else {
          stop("keywords.logic must be one of 'and' or 'or'!! ")
        }
      }
    }
    pathways <- pathways.info[if.selected, c("pathway.id", "pathway.name", "sub.database", "database")]
    
  } else {
    # using IDs to search for pathway type.pair.name =
    # paste(keyword.type,'_pathway.id',sep='')
    if(!exists("mapped.ids")) data(mapped.ids)
    # if (keyword.type %in% mapped.ids$gene) {
    if (keyword.type %in% unique(mapped.ids$gene[,2])) {
      mol.type <- "gene"
    } else if (keyword.type %in% mapped.ids$cpd) {
      mol.type <- "cpd"
    }
    mapping.table <- loadMappingTable(input.type = keyword.type, output.type = "pathway.id", 
                                     species = NULL, SBGNview.data.folder = SBGNview.data.folder, mol.type = mol.type)
    
    if (keyword.type == "compound.name") { # keyword.type == "CompoundName"
      
      if (keywords.logic == "and") {
        keywords.logic <- "or"
        warning("Searching pathways by Compound Names. Using keywords.logic='or'. Please don't set keywords.logic to 'and'!!!\n")
      }
      keywords <- tolower(keywords)
      mapping.table[, keyword.type] <- tolower(as.character(mapping.table[, keyword.type]))
      if (mol.name.match == "exact.match") {
        if.selected <- tolower(mapping.table[, keyword.type]) %in% keywords
        pathways <- mapping.table[if.selected, ]
      } else if (mol.name.match == "presence.of.input-string.in.target-name") {
        if.selected <- grepl(pattern = paste(keywords, collapse = "|"), ignore.case = TRUE, 
                             mapping.table[, keyword.type])
        sum(if.selected)
        pathways <- mapping.table[if.selected, ]
      } else if (mol.name.match == "jaccard") {
        best.match <- match.names(input.names = tolower(keywords), 
                                  output.names = tolower(mapping.table[, keyword.type]))
        pathways <- merge(best.match, mapping.table, by.x = "output.name", 
                          by.y = keyword.type)
        names(pathways)[1] <- c("compound.name")
      }
    } else {
      pathways <- mapping.table[tolower(mapping.table[, keyword.type]) %in% tolower(keywords), ]
    }
  }
  return(pathways = pathways)
}

#########################################################################################################
# given two vectors of characters/words, this function find the best match
# between words in the two vectors, by splitting a word into sub-strings and use
# 'jaccard' to calculate two words' similarity (similarity between their
# sub-string vectors). It for one name, the function will output the pair with
# largest similarity score.
match.names <- function(input.names, output.names, output.file = NULL) {
  
  input.names <- tolower(input.names)
  output.names <- tolower(output.names)
  input.names <- unique(input.names)
  output.names <- unique(output.names)
  
  all.pairs <- expand.grid(input.names, output.names)
  
  # calculate jaccard of all possible matches
  ot <- apply(all.pairs, 1, function(pair) {
    p1n <- pair[1]
    p2n <- pair[2]
    p1 <- strsplit(p1n, "[^a-zA-Z0-9]+", perl = TRUE)[[1]]
    p1 <- unique(p1)
    p1
    p2 <- strsplit(p2n, "[^a-zA-Z0-9]+", perl = TRUE)[[1]]
    p2 <- unique(p2)
    p2
    inter <- length(intersect(p1, p2))
    uni <- length(unique(union(p1, p2)))
    jc <- inter/uni
    return(c(p1n, p2n, jc))
  })
  ot <- t(ot)
  
  ot <- as.data.frame(ot, stringsAsFactors = FALSE)
  names(ot) <- c("input.name", "output.name", "jaccard")
  ot$jaccard <- as.numeric(ot$jaccard)
  
  # extract the pairs with max jaccard
  ot1 <- ot
  mapped.table <- by(ot1, as.factor(ot1$input.name), function(df) {
    if.max <- df$jaccard == max(df$jaccard)
    mx.df <- df[if.max, ]
  })
  mapped.table <- do.call(rbind, mapped.table)
  row.names(mapped.table) <- c()
  if (!is.null(output.file)) {
    write.table(mapped.table, output.file, sep = "\t", row.names = FALSE, quote = FALSE)
  }
  return(mapped.table)
}

#########################################################################################################
# find pathways based on organism and pathway completeness in a species
filter.pathways.by.org <- function(pathways, org, pathway.completeness, 
                                   pathways.info.file.folder = "./SBGNview.tmp.data") {
  
  org <- tolower(org)
  if(!exists("pathway.species.pct_Mapped")) data("pathway.species.pct_Mapped")
  if (org == "all") {
    org <- unique(pathway.species.pct_Mapped$species)
  }
  pathway.species.pct_Mapped[, "species"] <- as.character(pathway.species.pct_Mapped[, "species"])
  pathway.species.pct_Mapped[, "pathway"] <- as.character(pathway.species.pct_Mapped[,  "pathway"])
  
  if (!is.null(pathway.completeness)) {
    message("Using user provided pathway completeness cutoff: the same cutoff for different pathways!\n")
    
    pathway.ids <- pathway.species.pct_Mapped[pathway.species.pct_Mapped$species %in% 
                                                org & pathway.species.pct_Mapped$pct.mapped.species.pathway >= pathway.completeness, ]
  } else {
    message("Using pre-generated pathway-specific completeness cutoff: different cutoff for different pathways! Pathway 'exist' and 'not exist' accross all species defined by this cutoff have the largest ANOVA F statistic when comparing completeness between 'exist' and 'not exist' groups. \n")
    pathway.ids <- pathway.species.pct_Mapped[pathway.species.pct_Mapped$species %in% org, ]
    if(!exists("pathway.completeness.cutoff.info")) data("pathway.completeness.cutoff.info")
    pathway.specific.cutoff <- pathway.completeness.cutoff.info$cutoff
    names(pathway.specific.cutoff) <- pathway.completeness.cutoff.info$pathway
    if.pass.cutoff <- pathway.ids$pct.mapped.species.pathway > pathway.specific.cutoff[pathway.ids$pathway]
    pathway.ids <- pathway.ids[if.pass.cutoff, ]
  }
  colnames(pathway.ids)[1] <- "pathway.id"
  pathways <- merge(pathways, pathway.ids, all = FALSE)
  return(pathways)
}

#########################################################################################################
#' Retrieve pathways by keywords
#' 
#' This function searches for pathways by input keywords.
#' 
#' @param keywords A character string or vector. The search is case in-sensitive.
#' @param keywords.logic A character string. Options are 'and' or 'or' (default). This will tell the function if the search require 'all' or 'any' of the keywords to be present. It only makes difference when keyword.type is 'pathway.name'.
#' @param keyword.type A character string. Either 'pathway.name' (default) or one of the ID types in \code{data('mapped.ids')}
#' @param org  A character string. The KEGG species code.
#' @param SBGNview.data.folder A character string. Default: "./SBGNview.tmp.data". The path to a folder that will hold downloaded ID mapping files and pathway information data files.
#' @details If 'keyword.type' is 'pathway.name' (default), this function will search for the presence of any keyword in the pathway.name column of data(pathways.info). The search is case in-sensitive. If 'keyword.type' is one of the identifier types and 'keywords' are corresponding identifiers, this function will return pathways that include nodes mapped to input identifiers. 
#' @return A dataframe. Contains information of pathways found.
#' @examples 
#' data(pathways.info)
#' input.pathways <- findPathways('Adrenaline and noradrenaline biosynthesis')
#' @export

findPathways <- function(keywords = NULL, keywords.logic = "or", keyword.type = "pathway.name", 
                         org = NULL, SBGNview.data.folder = "./SBGNview.tmp.data") {
  
  pathway.completeness = NULL
  mol.name.match = c("exact.match", "jaccard", 
                     "presence.of.input-string.in.target-name")[1]
  if (!file.exists(SBGNview.data.folder)) {
    dir.create(SBGNview.data.folder)
  }
  
  keywords <- as.vector(keywords)
  # changed CompoundName to compound.name and kegg.ligand to kegg in mapped.ids and mapping file names
  if(keyword.type == "CompoundName") keyword.type <- "compound.name"
  if(keyword.type %in% c("kegg.ligand", "KEGG")) keyword.type <- "kegg"
  
  pathways <- find.pathways.by.keywords(keywords, keyword.type, keywords.logic, 
                                        mol.name.match, SBGNview.data.folder = SBGNview.data.folder)
  
  if (!is.null(org)) {
    pathways <- filter.pathways.by.org(pathways, org, pathway.completeness, pathways.info.file.folder = "./SBGNview.tmp.data")
  }
  pathways.with.name <- pathways.info[, c("pathway.id", "pathway.name")]
  pathways <- merge(pathways, pathways.with.name, all.x = TRUE)
  pathways <- unique(pathways)
  
  return(pathways)
}

#########################################################################################################
############# Below are previous versions of some of the above functions. 
############# These are kept for reference purposes.
#########################################################################################################
### original (old version) function replaced with new version. 
# download.mapping.file <- function(input.type, output.type, 
#                                   species = NULL, SBGNview.data.folder = "./SBGNview.tmp.data/") {
#   options(warn = -1)
#   # R CMD check will give different sort result if you didn't specify 'method': the
#   # default sort method depends on the locale of your system. And R CMD check uses
#   # a different locale than my interactive session.  The issue resided in this: R
#   # interactively used:LC_COLLATE=en_US.UTF-8; R CMD check used: LC_COLLATE=C;
#   # https://stackoverflow.com/questions/42272119/r-cmd-check-fails-devtoolstest-works-fine
#   type.pair.name.1 <- paste(sort(c(input.type, output.type), method = "radix", 
#                                  decreasing = FALSE), collapse = "_")
#   type.pair.name.2 <- paste(sort(c(input.type, output.type), method = "radix", 
#                                  decreasing = TRUE), collapse = "_")
#   type.pair.name.1.org <- paste(species, type.pair.name.1, sep = "_")
#   type.pair.name.2.org <- paste(species, type.pair.name.2, sep = "_")
#   try.file.names <- c(type.pair.name.1.org, type.pair.name.2.org, type.pair.name.1, 
#                       type.pair.name.2)
#   if.online.mapping.avai <- FALSE
#   if.mapping.in.data.package <- FALSE
#   for (i in seq_len(length(try.file.names))) {
#     type.pair.name.try <- try.file.names[i]
#     if.data.exists <- tryCatch(data(list = type.pair.name.try), warning = function(w) "no such data")
#     if (if.data.exists %in% c(type.pair.name.try, "mapping.list", "mapping.table")) {
#       mapping.file.name <- type.pair.name.try
#       message("ID mapping ", type.pair.name.try, " is included in SBGNview.data package. Loading it.")
#       if.mapping.in.data.package <- TRUE
#       (break)()
#     } else {
#       print("ID mapping not in data package for this pair")
#       print(type.pair.name.try)
#       print(if.data.exists)
#     }
#   }
#   if (!if.mapping.in.data.package) {
#     # stop("Can't map this pair of IDs with SBGNview.data!", type.pair.name.try,  "\n")
#     warning("Can't map this pair of IDs with SBGNview.data! Generating mapping with Pathview!", type.pair.name.try,  "\n")
#     mapping.file.name = "online mapping not available"
#   }
#   options(warn = 1)
#   return(mapping.file.name)
# }

#########################################################################################################
### original (old version) of the function
# loadMappingTable <- function(output.type, input.type, species = NULL, cpd.or.gene, 
#                              limit.to.ids = NULL, SBGNview.data.folder = "./SBGNview.tmp.data"){ 
#   
#   input.type = gsub("entrez","ENTREZID",input.type)
#   output.type = gsub("entrez","ENTREZID",output.type)
#   if(input.type == output.type){
#     stop("Input type and output types are the same!")
#   }
#   species = gsub("Hs","hsa",species)
#   type.pair.name = paste(sort(c(input.type,output.type),method = "radix",decreasing = TRUE),collapse="_") # R CMD check will give different sort result if we didn't specify "method":  the default sort method depends on the locale of your system. And R CMD check uses a different locale to R interactive session. The issue resided in this: R interactively used:LC_COLLATE=en_US.UTF-8; R CMD check used: LC_COLLATE=C; https://stackoverflow.com/questions/42272119/r-cmd-check-fails-devtoolstest-works-fine
#   if(!file.exists(SBGNview.data.folder)){
#     dir.create(SBGNview.data.folder)
#   }
#   
#   mapping.file.name <- download.mapping.file(input.type, output.type, species, SBGNview.data.folder = SBGNview.data.folder)
#   
#   if(file.exists(mapping.file.name) | tryCatch(data(list = mapping.file.name), 
#                                                warning = function(w) "no such data") %in% c(mapping.file.name, "mapping.list", "mapping.table" )
#   ){
#     message("Loading ID mapping file: ",mapping.file.name," \n")
#     .GlobalEnv$mapping.list = NULL; .GlobalEnv$mapping.table = NULL
#     if.from.file = FALSE
#     if(file.exists(mapping.file.name)){
#       if.from.file = TRUE
#       var.name = load(mapping.file.name)
#     }else{
#       if(!exists(mapping.file.name)){
#         data(list = mapping.file.name)
#       }
#     }
#     if(!is.null(mapping.list)){ # if one of "output.type" or "input.type" is NOT "pathway.id", then the data's names is "mapping.list"
#       id.map = mapping.list[[1]][[1]]
#     }else if(!is.null(mapping.table)){ # if one of "output.type" or "input.type" is "pathway.id", then the data's names is "mapping.table"
#       id.map = mapping.table
#     }else if(if.from.file){ # if one of "output.type" or "input.type" is "pathway.id", then the data's names is "mapping.table"
#       id.map = get(var.name)
#     }else{
#       # print("loading mapping file from data package")
#       # print(mapping.file.name)
#       # data(list = mapping.file.name)
#       id.map = get(mapping.file.name)
#     }
#     message("Finished loading")
#     
#     if("species" %in% colnames(id.map) &  !is.null(id.map) ){
#       if(any(id.map[,"species"] %in% species) ){
#         id.map = id.map[id.map[,"species"] %in% species,c(input.type,output.type)]
#       }
#     }
#   } else {
#     if(cpd.or.gene == "gene"){
#       # pathway.id used for heterogeneous purposes and mapping gene to pathway
#       # pathwayCommons and metacyc.SBGN are databases mentioned in pathways.stats
#       # unique(pathways.info$macromolecule.ID.type) returns "pathwayCommons" "metacyc.SBGN"  "ENZYME"  
#       # ENZYME not in if condition below b/c it can be mapped directly 
#       if(any(c(input.type,output.type) %in% c("pathwayCommons","metacyc.SBGN","pathway.id")) & 
#          !any(c(input.type,output.type) %in% c("KO")) 
#       ){ # if input and output don't contain KO
#         
#         # load mapping table for mapping KO to glyph id
#         ko.to.glyph.id = loadMappingTable(input.type = output.type,
#                                           output.type = "KO",
#                                           cpd.or.gene = "gene",
#                                           species = species,
#                                           SBGNview.data.folder = SBGNview.data.folder)
#         ko.to.glyph.id = ko.to.glyph.id[[1]][[1]]
#         
#         # load mapping table for mapping user input to KO id
#         input.to.ko = loadMappingTable(input.type = input.type,
#                                        output.type = "KO",
#                                        cpd.or.gene = "gene",
#                                        limit.to.ids = limit.to.ids,
#                                        species = species,
#                                        SBGNview.data.folder = SBGNview.data.folder)
#         input.to.ko = input.to.ko$gene[[1]]
#         input.to.glyph.id = merge(input.to.ko,ko.to.glyph.id,all= FALSE)
#         id.map = input.to.glyph.id[,c(input.type,output.type)]
#       } else {
#         message("\nID mapping not pre-generated. Using Pathview!!!\n")
#         id.map = geneannot.map.ko(in.ids = limit.to.ids,
#                                   in.type = input.type,
#                                   out.type = output.type,
#                                   species = species,
#                                   unique.map= FALSE,
#                                   SBGNview.data.folder = SBGNview.data.folder)
#         if(is.vector(id.map)){
#           id.map = as.matrix(t(id.map))
#         }
#       }
#     } else if(cpd.or.gene == "compound"){
#       message("\nID mapping not pre-generated. Using Pathview cpd!!!\n")
#       if(is.null(limit.to.ids)){
#         stop("Must provide input IDs when using pathview mapping!")
#       }
#       print(input.type)
#       print(output.type)
#       id.map =  pathview::cpdidmap(in.ids = limit.to.ids, in.type = input.type, 
#                                    out.type = output.type)
#     } else {
#       message("\nCouldn't fine ID mapping table between ", input.type, " and ", output.type, "!!!\n")
#       #message("Tried online resource ", online.mapping.file, "\n")
#       message("Please provide ID mapping table using \"id.mapping.table\"!!\n")
#       stop("ID mapping table not provided")
#     }
#     
#     id.map = id.map[!is.na(id.map[,2]),]
#     if(is.vector(id.map)){
#       id.map = as.matrix(t(id.map))
#     }
#     id.map = id.map[!is.na(id.map[,1]),]
#     if(is.vector(id.map)){
#       id.map = as.matrix(t(id.map))
#     }
#     id.map = unique(id.map)
#   }
#   
#   # if(!is.null(limit.to.ids)){
#   #     # some IDs are quoted, need to remove the quote characters
#   #     id.map[,input.type] = gsub("^\"","",id.map[,input.type])
#   #     id.map[,input.type] = gsub("\"$","",id.map[,input.type])
#   #     id.map = id.map[id.map[,input.type] %in% limit.to.ids,]
#   # }
#   
#   # add additional mapping using KO to glyph.id
#   mapping.list = list()
#   if(is.vector(id.map)){
#     id.map = as.matrix(t(id.map))
#   }
#   id.map[,1] = as.character(id.map[,1])
#   id.map[,2] = as.character(id.map[,2])
#   id.map = as.matrix(id.map)
#   mapping.list[[cpd.or.gene]]= list()
#   mapping.list[[cpd.or.gene]][[type.pair.name]] = id.map
#   message("Generated ID mapping list")
#   
#   return(mapping.list)
# }

#########################################################################################################
### old version of function
# geneannot.map.ko <- function(in.ids = NULL, in.type, out.type, species = "all", unique.map, 
#     SBGNview.data.folder = "./SBGNview.tmp.data") {
#     # pathview's geneannot.map can't map KO, so here included KO mapping
#     if (!is.null(in.ids)) {
#         in.ids <- gsub("\"", "", in.ids)
#     }
#     message("\nusing pathview for id mapping: ", in.type, " to ", out.type, "\n\n")
#     
#     if (any(c(in.type, out.type) %in% "KO")) {
#         filter.type <- in.type
#         out.type <- setdiff(c(in.type, out.type), "KO")
#         in.type <- "KO"
#         
#         # break endless loop - this loop doesn't allow the rest of the code to run
#         # to break this, we need a mapping list. Since one doesn't exit, we generate mapping on the fly
#         # use KEGG API. KEGGREST Bioconductor Pkg
#         mapping.list <- loadMappingTable(input.type = "KO", output.type = "ENTREZID", 
#             cpd.or.gene = "gene", species = species, SBGNview.data.folder = SBGNview.data.folder)
#         #
#         message("loaded mapping list")
#         
#         mapping.table <- mapping.list[[1]][[1]]
#         if (out.type %in% c("ENTREZID", "ez", "entrezid")) {
#             print("filter species")
#             id.map <- mapping.table[mapping.table[, "species"] == species, c(in.type, 
#                 out.type)]
#             if (!is.null(in.ids)) {
#                 mapping.table <- mapping.table[mapping.table[, filter.type] %in% 
#                   in.ids, ]
#             }
#         } else {
#             if (species == "mmu") {
#                 species <- "mouse"
#             }
#             output.to.eg <- pathview::eg2id(eg = mapping.table[, "ENTREZID"], category = out.type, 
#                 org = species, unique.map = unique.map)
#             
#             output.to.ko <- merge(output.to.eg, mapping.table, all.x = TRUE)
#             id.map <- output.to.ko[, c("KO", out.type)]
#             id.map <- id.map[!is.na(id.map[, out.type]), ]
#             # filter to output only input IDs
#             if (!is.null(in.ids)) {
#                 id.map <- id.map[id.map[, filter.type] %in% in.ids, ]
#             }
#         }
#     } else {
#         if (species == "mmu") {
#             species <- "mouse"
#         }
#         if (is.null(in.ids)) {
#             stop("Must provide input IDs when using pathview mapping!")
#         }
#         
#         # id.map <- geneannot.map.all(in.ids = in.ids, in.type = in.type, out.type = out.type, 
#         #     org = species, unique.map = unique.map)
#         print(out.type)
#         id.map <- pathview::geneannot.map(in.ids = in.ids, 
#                                           in.type = in.type, 
#                                           out.type = out.type,
#                                           org = species, 
#                                           unique.map = unique.map)
#     }
#     return(id.map)
# }

#########################################################################################################
### this function was previously used in the older version geneannot.map.ko
### the functionality provided by this function was replaced with 
### pathview::geneannot.map in the updated geneannot.map.ko function 
# geneannot.map.all <- function(in.ids, in.type, out.type, org = "Hs", pkg.name = NULL, 
#                               unique.map = TRUE, na.rm = TRUE, keep.order = FALSE) {
#   
#   if (is.null(pkg.name)) {
#     # pkg.name=paste('org', org, 'eg.db', sep='.')
#     message("data bods")
#     data(bods)
#     ridx <- grep(tolower(paste0(org, "[.]")), tolower(bods[, 1]))
#     if (length(ridx) == 0) {
#       ridx <- grep(tolower(org), tolower(bods[, 2:3]))%%nrow(bods)
#       if (length(ridx) == 0) 
#         stop("Wrong org value!")
#       if (any(ridx == 0)) 
#         ridx[ridx == 0] <- nrow(bods)
#     }
#     pkg.name <- bods[ridx, 1]
#   }
#   all.mappings <- character(2) 
#   message("all mappings: ", all.mappings, " items")
#   for (i in seq_len(length.out = nrow(bods))) {
#     if (bods[i, 1] != pkg.name) {
#       (next)()
#     }
#     pkg.name <- bods[i, 1]
#     
#     if (!pkg.name %in% rownames(installed.packages())) { 
#       #(next)() # if package not installed, evals to True, but doesn't install pkg.
#       # install pkg.name
#       message(pkg.name, " is not installed. Installing ", pkg.name)
#       BiocManager::install(pkg.name, update=FALSE)
#       
#     }
#     message("Using package: ", pkg.name, "\n")
#     db.obj <- eval(parse(text = paste0(pkg.name, "::", pkg.name)))
#     id.types <- AnnotationDbi::columns(db.obj)  #columns(eval(as.name(pkg.name)))
#     
#     in.type <- toupper(in.type)
#     out.type <- toupper(out.type)
#     eii <- in.type == toupper("entrez") | in.type == toupper("eg")
#     if (any(eii)) 
#       in.type[eii] <- "entrez"
#     eio <- out.type == toupper("entrez") | out.type == toupper("eg")
#     if (any(eio)) 
#       out.type[eio] <- "entrez"
#     if (in.type == out.type) 
#       stop("in.type and out.type are the same, no need to map!")
#     
#     nin <- length(in.type)
#     if (nin != 1) 
#       stop("in.type must be of length 1!")
#     out.type <- out.type[!out.type %in% in.type]
#     nout <- length(out.type)
#     
#     msg <- paste0("must from: ", paste(id.types, collapse = ", "), "!")
#     if (!in.type %in% id.types) 
#       stop("'in.type' ", msg)
#     if (!all(out.type %in% id.types)) 
#       stop("'out.type' ", msg)
#     
#     in.ids0 <- in.ids
#     in.ids <- unique(as.character(in.ids))  #unique necessary for select()# if(unique.map)
#     out.ids <- character(length(in.ids))
#     res <- try(suppressWarnings(AnnotationDbi::select(db.obj, keys = in.ids, 
#                                                       keytype = in.type, columns = c(in.type, out.type))))
#     all.mappings <- rbind(all.mappings, res)
#   }
#   res <- as.data.frame(all.mappings[-1, ])
#   
#   res <- res[, c(in.type, out.type)]
#   
#   na.idx <- is.na(res[, 2])
#   if (sum(na.idx) > 0) {
#     n.na <- length(unique(res[na.idx, 1]))
#     if (n.na > 0) {
#       print(paste("Note:", n.na, "of", length(in.ids), "unique input IDs unmapped."))
#     }
#     if (na.rm) 
#       res <- res[!na.idx, ]
#   }
#   
#   cns <- colnames(res)
#   
#   res <- as.matrix(res)
#   rownames(res) <- NULL
#   return(res)
# }

#########################################################################################################
datapplab/SBGNview documentation built on June 20, 2022, 9:55 p.m.