###################################################
## SQLite Structure Database for Drugs from CMAP ##
###################################################
## Author: Thomas Girke
## Last update: 13-May-16
## Obtaining the structures and PubChem IDs for CMAP was much harder than expected since
## cmap only provides inconsistently formatted compound names and order numbers. The following
## documents the workflow that was used.
#' Build CMAP Database
#'
#' This function builds a SQLite database named as 'cmap.db' that contains id
#' mappings of cmap names to PubChem/DrugBank IDs as well as compound structure information.
#'
#' For about 2/3 of the CMAP drugs, one can obtain their PubChem/DrugBank IDs from
#' the DMAP site here: http://bio.informatics.iupui.edu/cmaps. Since this website is no
#' longer supported, the processed CMAP name to PubChem and DrugBank ID mapping table
#' is stored under the "inst/extdata" folder of this package named as "dmap_unique.txt".
#' The SMILES strings for CMAP entries were obtained from ChemBank. Compounds
#' were matched by names using the 'stringdist' library where cmap_name from
#' CMAP were mapped to the closest name in ChemBank.
#' @param dest_dir character(1), destination directory in which the resulting
#' SQLite database, named 'cmap.db', is stored. The default is the user's
#' current working directory.
#' @return Called for its side effect: writes the "cmap.db" SQLite database to the destination directory defined by the user.
#' @import ChemmineR
#' @importFrom utils download.file
#' @importFrom utils read.delim
#' @importFrom utils write.table
#' @importFrom methods as
#' @importFrom stats na.omit
#' @examples
#' library(ChemmineR)
#' ## Query database
#' # buildCMAPdb(dest_dir="./inst/scripts")
#' # conn <- initDb("./inst/scripts/cmap.db")
#' # results <- getAllCompoundIds(conn)
#' # sdfset <- getCompounds(conn, results, keepOrder=TRUE)
#' # sdfset
#' # as.data.frame(datablock2ma(datablock(sdfset)))[1:4,]
#' # myfeat <- listFeatures(conn)
#' # feat <- getCompoundFeatures(conn, results, myfeat)
#' # feat[1:4,]
buildCMAPdb <- function(dest_dir=".") {
## Join DMAP and CMAP tables
dmap_path <- system.file("extdata/dmap_unique.txt", package="customCMPdb")
dmap <- read.delim(dmap_path)
row.names(dmap) <- tolower(dmap$SOURCE_DRUG)
cmap_inst <- system.file("extdata/cmap_instances_02.txt", package="customCMPdb")
cmap <- read.delim(cmap_inst)
cmap <- cmap[!duplicated(tolower(cmap$cmap_name)),]
row.names(cmap) <- tolower(cmap$cmap_name)
df <- data.frame(cmap, dmap[row.names(cmap),])
# sum(!is.na(df$SOURCE_DRUG))
# length(unique(na.omit(df$SOURCE_DRUG)))
# 867 cmap drugs have PubChem IDs; 442 do not have;
# instance_id 1345 and 2952 with two cmap_name "betulinic acid" and "betulin"
# but the same SOURCE_DRUG "Betulinic Acid"
## Obtain SMILES strings for CMAP entries from ChemBank
## Tyler did this with help from P. Clemens from ChemBank.
## Compounds were matched by names using the stringdist library
## (here cmap_name from CMAP and the closest name in ChemBank).
## This is the location of the input files:
## /rhome/tbackman/Projects/cmap_drugs/src/mapIDs.R
## /rhome/tbackman/Projects/cmap_drugs/working/cmap_instances_02.xls
## /rhome/tbackman/Projects/cmap_drugs/working/allcompounds.smiles
## /rhome/tbackman/Projects/cmap_drugs/working/chembank-synonyms.txt
## /rhome/tbackman/Projects/cmap_drugs/working/chembank-structures.txt
## /rhome/tbackman/Projects/cmap_drugs/working/smilesMatches.tsv
## Note: file smilesMatches.tsv was generated by Tyler, see above
smipath <- system.file("extdata/smilesMatches.tsv", package="customCMPdb")
smiMA <- read.csv(smipath)
row.names(smiMA) <- tolower(smiMA$cmap_name)
bothDF <- cbind(df, smiMA[rownames(df),
c("chembank_id", "chembank_name", "match_distance", "smiles")])
#dim(bothDF[as.integer(bothDF[,"match_distance"]) == 0,]) # 1223 x 28
# Note: compounds with match_distance > 0 need to be checked
## Create SDFset
#library(ChemmineR); library(ChemmineOB) # requires openbabel module
smi <- as.character(bothDF$smiles)
names(smi) <- as.character(bothDF$cmap_name)
smi <- as(smi, "SMIset")
sdfset <- smiles2sdf(smi)
datablock(sdfset) <- bothDF # Stores annotation info in datablock slots
## Create SQLite database
standardFeatures <- function(sdfInput) {
data.frame(propOB(sdfInput),
Ncharges=sapply(bonds(sdfInput, type="charge"), length),
as.data.frame(groups(sdfInput, type="countMA")),
as.data.frame(rings(sdfInput, upper=8, type="count", arom=TRUE)))
}
conn <- initDb(paste0(dest_dir, "/cmap.db"))
ids <- loadSdf(conn, sdfset, fct=standardFeatures)
dbDisconnect(conn)
}
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.