cstrsplitSym=NA
.onLoad <- function(libname,pkgname) {
#message("libname: ",libname)
#message("pkgname: ",pkgname)
cstrsplitSym <<- getNativeSymbolInfo("cstrsplit")
}
.db.header.size <- 16
# supported elements
.elements <- list(
R=0,
H=1,
He=2,
Li=3,
Be=4,
B=5,
C=6,
N=7,
O=8,
F=9,
Ne=10,
Na=11,
Mg=12,
Al=13,
Si=14,
P=15,
S=16,
Cl=17,
Ar=18,
K=19,
Ca=20,
Sc=21,
Ti=22,
V=23,
Cr=24,
Mn=25,
Fe=26,
Co=27,
Ni=28,
Cu=29,
Zn=30,
Ga=31,
Ge=32,
As=33,
Se=34,
Br=35,
Kr=36,
Rb=37,
Sr=38,
Y=39,
Zr=40,
Nb=41,
Mo=42,
Tc=43,
Ru=44,
Rh=45,
Pd=46,
Ag=47,
Cd=48,
In=49,
Sn=50,
Sb=51,
Te=52,
I=53,
Xe=54,
Cs=55,
Ba=56,
La=57,
Ce=58,
Pr=59,
Nd=60,
Pm=61,
Sm=62,
Eu=63,
Gd=64,
Tb=65,
Dy=66,
Ho=67,
Er=68,
Tm=69,
Yb=70,
Lu=71,
Hf=72,
Ta=73,
W=74,
Re=75,
Os=76,
Ir=77,
Pt=78,
Au=79,
Hg=80,
Tl=81,
Pb=82,
Bi=83,
Po=84,
At=85,
Rn=86,
Fr=87,
Ra=88,
Ac=89,
Th=90,
Pa=91,
U=92,
Np=93,
Pu=94,
Am=95,
Cm=96,
Bk=97,
Cf=98,
Es=99,
Fm=100,
Md=101,
No=102,
Lr=103,
Rf=104,
Db=105,
Sg=106,
Bh=107,
Hs=108,
Mt=109,
Ds=110,
Rg=111)
# similarity model
# input: descriptors for two compounds. both are vectors
# output: similarity
.cmp.similarity <- function(a, b, mode=1, worst=0,
dbcon.a=NULL, db.intsize.a=4,
dbcon.b=NULL, db.intsize.b=4)
{
## ThG: added for compatibility with new S4 classes APset/AP
if(inherits(a,"APset")) { a <- ap(a)[[1]] }
if(inherits(b,"APset")) { b <- ap(b)[[1]] }
if(inherits(a,"AP")){ a <- ap(a) }
if(inherits(b,"AP")){ b <- ap(b) }
## ThG: end of lines
# check argument, make sure they are vectors and not lists
if (is.character(a) && length(a) == 3 && a[[1]] == 'filedb:')
a <- .load.file.backed.desc(a, dbcon.a, db.intsize.a)
if (is.character(b) && length(b) == 3 && b[[1]] == 'filedb:')
b <- .load.file.backed.desc(b, dbcon.b, db.intsize.b)
if (is.null(b) || is.null(a))
return(0)
if (!is.numeric(b) || !is.numeric(a) ) {
stop("Both arguments must be AP/APset objects or descriptors in the form of vectors.\n",
"Did you use \"[]\" instead of \"[[]]\" to index the descriptor ",
"database?")
}
if (mode == 1 && worst != 0) {
# estimate upper bound
i <- length(a)
j <- length(b)
u_bound <- min(i, j) / max(i, j)
if (u_bound < worst)
return(0)
}
if (mode == 0) {
# if no mode is given, guess a mode based on compound size
# difference
if (length(a) < .25 * length(b) || length(a) > 4 * length(b))
mode <- 3
else
mode <- 1
}
if (mode == 1)
return (length(intersect(a, b)) / length(union(a, b)))
else if (mode == 2)
return (length(intersect(a, b)) / min(length(a), length(b)))
else if (mode == 3) {
s <- length(intersect(a, b)) / min(length(a), length(b))
return (s^3)
}
}
cmp.similarity <- function(a, b, mode=1, worst=0)
{
.cmp.similarity(a, b, mode=mode, worst=worst)
}
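## Usage sketch (commented out so nothing runs at load time; "compounds.sdf"
## is a placeholder file name): mode=1 is the Tanimoto-like ratio
## |A n B| / |A u B|, while modes 2 and 3 divide by min(|A|, |B|) instead.
# apset <- sdf2ap(read.SDFset("compounds.sdf"))
# cmp.similarity(apset[1], apset[2])          # Tanimoto coefficient (mode=1)
# cmp.similarity(apset[1], apset[2], mode=2)  # overlap coefficient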
cmp.parse1 <- function(filename) sdf2ap(read.SDFset(filename))[[1]]
cmp.parse <- function(filename) apset2descdb(sdf2ap(read.SDFset(filename)))
# search db for similar compounds. `db' is the database returned by
# `cmp.parse' or `sdfs_to_desc'. `query' is the descriptor of one compound,
# either returned by `cmp.parse1/sdf_to_desc' or a reference to one entry in
# db such as db$descdb[[123]]. Two types of cutoff are supported: a score
# cutoff (<=1) or a count cutoff (>1).
cmp.search <- function(db, query, type=1, cutoff=0.5, return.score=FALSE, quiet=FALSE,
mode=1, visualize=FALSE, visualize.browse=TRUE, visualize.query=NULL)
{
## ThG: added for compatibility with new S4 classes APset/AP
## Note: type argument was also added (has no impact on old list object).
dbtype <- as.character(class(db))
if(dbtype=="APset") { db <- apset2descdb(db) }
if(inherits(query,"APset")) query <- ap(query[[1]])
if(inherits(query,"AP")) query <- ap(query)
## ThG: end of lines
if (.is.file.backed.desc(query))
query <- .load.file.backed.desc(query)
dbcon <- NULL
intsize <- 4
if (db$type == 'file-backed') {
for (i in 1:length(db$descdb)) {
if (.is.file.backed.desc(db$descdb[[i]])) {
dbcon <- file(paste(db$descdb[[i]][[2]], '.cdb', sep=''), 'rb')
seek(dbcon, .db.header.size)
intsize <- readBin(dbcon, integer(), n=1, size=1)
break
}
}
}
scores <- array()
ids <- array()
pos <- 1 # tail of entries
size <- length(db$cids)
perstep <- max(1, round(size / 25))
for (i in 1:size) {
if (!quiet && i %% perstep == 0) {
steps <- i / perstep
.progress_bar(paste(min(steps*4, 100), "%", sep=""))
}
cmp <- db$descdb[[i]]
if (db$type == 'file-backed' && .is.file.backed.desc(db$descdb[[i]]))
cmp <- .load.file.backed.desc(cmp, dbcon, intsize)
if (cutoff <= 1)
score <- .cmp.similarity(cmp, query, mode=mode, worst=cutoff)
else if (pos - 1 < cutoff)
score <- .cmp.similarity(cmp, query, mode=mode, worst=0)
else
score <- .cmp.similarity(cmp, query, mode=mode,
worst=scores[[pos - 1]])
if (cutoff <= 1) {
if (score >= cutoff) {
scores[[pos]] <- score
ids[[pos]] <- i
pos <- pos + 1
}
} else {
# using naive insertion sort
if (pos - 1 < cutoff)
pos <- pos + 1
else if (scores[[pos - 1]] >= score)
next
pos_ <- pos - 2
while (pos_ > 0 && scores[[pos_]] < score) {
scores[[pos_ + 1]] <- scores[[pos_]]
ids[[pos_ + 1]] <- ids[[pos_]]
pos_ <- pos_ - 1
}
pos_ <- pos_ + 1
scores[[pos_]] <- score
ids[[pos_]] <- i
}
}
if (!is.null(dbcon)) close(dbcon)
if (!quiet) cat("\n")
if (cutoff <= 1) {
order <- sort(scores, index.return=TRUE)[["ix"]]
order <- rev(order)
} else
order <- 1:length(scores)
## ThG: modified/added for compatibility with new S4 classes APset/AP
if (return.score && dbtype=="list") {
return(data.frame(ids=ids[order], scores=scores[order]))
}
if (!return.score && dbtype=="list") {
return(ids[order])
}
if (dbtype=="APset") {
if(type==1) {
return(ids[order])
}
if(type==2) {
index <- scores[order]
names(index) <- db$cids[ids[order]]
return(index)
}
if(type==3) {
return(data.frame(index=ids[order], cid=db$cids[ids[order]], scores=scores[order]))
}
}
## ThG: end of lines
}
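## Usage sketch (commented out; `apset' as above is a placeholder): with
## cutoff <= 1 all hits scoring at least the cutoff are returned, with
## cutoff > 1 only the best `cutoff' hits are kept. For APset databases,
## type=3 returns index, compound id and score in one data frame.
# cmp.search(apset, apset[1], type=3, cutoff=0.5)  # score cutoff
# cmp.search(apset, apset[1], type=3, cutoff=10)   # count cutoff: top 10 hits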
# segment SDFs. given indices of selected compounds, return the concatenation
# of SDFs for them
sdf.subset <- function(db, cmps)
{
if (length(grep('://', db$source)) == 0 && !file.exists(db$source)) {
stop("cannot find the source SDF file for this database\n")
}
output <- ""
for (i in cmps)
{
sdf <- .extract_single_sdf(db, i)
output <- paste(output, sdf, sep="")
}
return(output)
}
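## Usage sketch (commented out; file names are placeholders): extracts the SDF
## records of selected compounds from a legacy list-style database whose
## `source' and `sdfsegs' fields point back into the original SDF file.
# cat(sdf.subset(db, c(1, 3)), file="selected.sdf")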
## ThG: function is obsolete for APset class which works with standard R subsetting utilities.
db.subset <- function(db, cmps)
{
return(list(descdb=db$descdb[cmps], cids=db$cids[cmps],
sdfsegs=db$sdfsegs[cmps], source=db$source))
}
# helper function for sdf.subset. This function takes an index and returns the
# SDF of the compound at that index
.extract_single_sdf <- function(db, index)
{
if (index > length(db$sdfsegs))
stop("Index out of range in .extract_single_sdf")
skip <- db$sdfsegs[[index]]
con <- file(db$source, open="r")
if (skip > 0)
#scan(con, "raw", skip=skip-1, nlines=1, quiet=TRUE)
readLines(con, skip)
sdf <- .read.one.sdf(con, collapse=FALSE)
# replace line 1 with compound index if no name is available
if (length(grep("[a-zA-z0-9]", sdf[[1]])) == 0)
sdf[[1]] <- paste("ChemmineR_Unnamed_Compound_", index, sep="")
# close connection
close(con)
return(paste(sdf, sep="", collapse="\n"))
}
# explain an atom pair descriptor
db.explain <- function(desc)
{
## ThG: added for compatibility with new S4 classes APset/AP ##
if(inherits(desc,"APset")) desc <- ap(desc[[1]])
if(inherits(desc,"AP")) desc <- ap(desc)
## ThG: end of lines ##
if (is.character(desc)) {
if (.is.file.backed.desc(desc))
desc <- .load.file.backed.desc(desc)
else
stop("The descriptor(s) to be explained is not valid.")
}
if (length(desc) == 1)
return(.explain(desc))
else {
result <- c()
for (i in 1:length(desc))
result <- c(result, .explain(desc[[i]]))
return(result)
}
}
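## Usage sketch (commented out): prints one human-readable line per atom pair
## of the first compound in an APset, giving both atoms, their neighbor and
## pi-electron counts, and the bond distance separating them.
# db.explain(apset[1])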
.explain <- function(desc)
{
# number of appearances (lowest 7 bits; not used in the textual output)
occurrence <- desc %% 2^7 + 1
desc <- desc %/% 2^7
# extract left atom, distance, and right atom
left <- desc %/% 2^20
desc <- desc %% 2^20
dist <- desc %/% 2^13
desc <- desc %% 2^13
right <- desc
# atom properties
left <- .interpret_atom(left)
right <- .interpret_atom(right)
# output
return (paste(left$symbol, " [", left$neighbors, " neighbor(s),",
left$pi_electrons, " pi electrons]<---",
dist,
"--->", right$symbol, " [", right$neighbors,
" neighbor(s),", right$pi_electrons, " pi electrons]",
collapse="", sep=""))
}
# interpret an atom descriptor
# see also: .gen_atom_desc
.interpret_atom <- function(atom_desc)
{
# atom
atom_symbol <- names(.elements[.elements == atom_desc %/% 2^6])
# neighbors
neighbors <- atom_desc %% 2^6 %/% 2^3
# pi electron
pi_electrons <- atom_desc %% 2^3
return(list(symbol=atom_symbol, neighbors=neighbors,
pi_electrons=pi_electrons))
}
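## Worked example (commented out) of the bit layout implied by the decoding in
## .explain/.interpret_atom above; this is derived from that code, not an
## authoritative specification. An atom code packs element*2^6 +
## neighbors*2^3 + pi_electrons; an atom pair packs left*2^20 +
## distance*2^13 + right, shifted left by the 7 occurrence bits.
# .interpret_atom(6*2^6 + 3*2^3 + 1)  # carbon with 3 neighbors and 1 pi electron
# ap_code <- ((6*2^6 + 3*2^3 + 1) * 2^20 + 2*2^13 + (7*2^6 + 2*2^3)) * 2^7
# .explain(ap_code)  # C (3 neighbors, 1 pi electron) <---2---> N (2 neighbors, 0 pi electrons)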
# convert a factor to a vector to simplify union and intersection operations.
# It basically eliminates repeated names by adding a suffix. For example, if `a'
# appears twice in the factor, it will be converted to `a1' and `a2'.
# This is an internal function and is not intended to be used by users.
.factor_to_vector <- function(fac)
{
vec <- vector(mode='numeric')
index <- 1
freq <- table(fac)
keys <- names(freq)
for (i in 1:length(keys)) {
key <- keys[[i]]
if (freq[[key]] == 1) {
vec[[index]] <- as.numeric(key) * 2^7
index <- index + 1
} else {
for (j in 1:freq[[key]]) {
vec[[index]] <- as.numeric(key) * 2^7 + j - 1
index <- index + 1
}
}
}
return(vec)
}
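## Worked example (commented out): repeated atom pairs are made distinct in
## the low occurrence bits so that union() and intersect() count them
## separately.
# .factor_to_vector(as.factor(c(409, 409, 1000)))
# # -> c(409*2^7, 409*2^7 + 1, 1000*2^7), i.e. 52352 52353 128000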
.uniquifyAtomPairs <- function(desc) {
.Call("uniquifyAtomPairs",desc)
}
.progress_bar_int_cnt <- 0
.progress_bar <- function(label=NULL) {
progress <- c("|", "/", "-", "\\")
unlockBinding(".progress_bar_int_cnt", environment(.progress_bar))
.progress_bar_int_cnt <<- .progress_bar_int_cnt %% 4 + 1
if (is.null(label)) {
cat("\r", progress[[.progress_bar_int_cnt]])
} else {
cat("\r \r")
cat(progress[[.progress_bar_int_cnt]], label)
}
}
.postToHost <- function(host, path, data.to.send, port=80)
{
if(missing(path)) path <- "/"
ua <- "User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.1)"
boundary <- paste(
as.integer(runif(20, min=1, max=10)), collapse=""
)
header <- NULL
header <- c(header, paste("POST ", path, " HTTP/1.1", sep=""))
header <- c(header, paste("Host: ", host, sep=""))
header <- c(header, "Connection: close")
header <- c(header, ua)
header <- c(header, "Accept: */*")
header <- c(header,
paste("Content-type: multipart/form-data; boundary=",
boundary, sep="")
)
boundary <- paste('--', boundary, sep='')
mcontent <- NULL # keeps the content.
size <- 0
for (i in 1:length(data.to.send)) {
value <- data.to.send[[i]]
key <- names(data.to.send)[i]
content <- paste(
boundary, "\n",
"Content-Disposition: form-data; name=\"",
as.character(key), "\"", "\n",
"\n",
as.character(value), "\n",
sep=""
)
mcontent <- c(mcontent, charToRaw(content))
}
size <- length(mcontent) + length(strsplit(boundary, "")[[1]]) + 4
header <- c(header, paste("Content-length: ", size, "\n\n", sep=""))
postdata <- c(
charToRaw(paste(header, collapse="\n")),
mcontent,
charToRaw(paste(boundary,"--\n\n",sep=""))
)
rm(header,mcontent)
scon <- socketConnection(host=host,port=port,open="a+b",blocking=TRUE)
writeBin(postdata, scon, size=1)
output <- character(0)
repeat{
ss <- rawToChar(readBin(scon, "raw", 2048))
output <- paste(output,ss,sep="")
if(regexpr("\r\n0\r\n\r\n",ss)>-1) break
if(ss == "") break
}
close(scon)
return(output)
}
.read.one.sdf <- function(filename, collapse=TRUE)
{
if (length(filename) != 1)
stop('reference sdf must be a vector of only one entry!')
if (!("connection" %in% class(filename)) && !is.character(filename))
stop('reference sdf must be an SDF file or one character string of sdf')
if ("connection" %in% class(filename))
con <- filename
# is `filename' really a file name, or the SDF content itself? check for newline characters
else if (length(grep("\n", filename)) == 0 &&
length(grep("\r", filename)) == 0)
con <- file(filename, open="r")
else
con <- textConnection(filename)
result <- c()
while (TRUE) {
line <- readLines(con=con, n=1)
if (length(line) == 0 || "$$$$" %in% line) {
if (length(result) == 0)
stop("empty SDF!")
result <- c(result, '$$$$\n')
if (collapse)
return (paste(result, collapse="\n"))
else
return (result)
} else
result <- c(result, line)
}
}
.data.frame.to.str <- function(x)
{
con <- textConnection("string", "w", local=TRUE)
write.table(format(x), con, row.names=F, quote=F)
close(con)
return(paste(string, collapse='\n'))
}
.is.file.backed.desc <- function(desc)
{
is.character(desc) && length(desc) == 3 && desc[[1]] == 'filedb:'
}
.load.file.backed.desc <- function(desc, dbcon=NULL, intsize=4)
{
dbname <- desc[[2]]
pos <- desc[[3]]
closeme <- FALSE
if (is.null(dbcon)) {
db_a <- paste(dbname, '.cdb', sep='')
if (!file.exists(db_a)) stop("cannot find database file ", db_a)
dbcon <- file(db_a, 'rb')
closeme <- TRUE
seek(dbcon, .db.header.size)
intsize <- readBin(dbcon, integer(), n=1, size=1)
}
seek(dbcon, as.integer(pos))
size <- readBin(dbcon, integer(), n=1, size=intsize)
desc <- readBin(dbcon, integer(), n=size, size=intsize)
if (closeme) close(dbcon)
.factor_to_vector(as.factor(desc))
}
.haveOB <- function()
{
if(suppressWarnings(require('ChemmineOB', quietly=T))
&& ChemmineOB:::.supportedPlatform()) {
return(TRUE)
}else{
return(FALSE)
}
}
.ensureOB <- function(mesg = paste("ChemmineOB is required to make use of this function.",
"This package can be installed from Bioconductor with the",
"command 'BiocManager::install(\"ChemmineOB\")'.",
"It is not currently available for Windows, however.",
"See http://bioconductor.org/packages/devel/bioc/html/ChemmineOB.html",
"for more information."))
{
if(!.haveOB())
stop(mesg)
}
##############################################
## PubChem Fingerprint Similarity Searching ##
##############################################
## Convert base 64 encoded PubChem fingerprints to binary matrix or string vector
## Definition of PubChem's substructure dictionary-based fingerprints:
## ftp://ftp.ncbi.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
## For debugging use command: as.integer(rev(intToBits("19")[1:6]))
## Load PubChem's substructure dictionary from data dir of library
data(pubchemFPencoding); pubchemFPencoding <- pubchemFPencoding
fp2bit <- function(x, type=3, fptag="PUBCHEM_CACTVS_SUBSKEYS") {
## Convert base 64 strings to matrix
if(inherits(x,"SDFset")) {
blockmatrix <- datablock2ma(datablocklist=datablock(x))
fp <- blockmatrix[, fptag]
}
if(is.matrix(x)) {
blockmatrix <- x
fp <- blockmatrix[, fptag]
}
if(is.character(x)) {
fp <- x
}
fpma <- unlist(strsplit(fp, ""))
fpma <- matrix(fpma, length(fp), nchar(fp[1]), byrow=TRUE)
fpma <- fpma[, 1:154] # remove padding signs '='
## base 64 decoding (base 64 alphabet from http://www.faqs.org/rfcs/rfc3548.html)
base64 <- c(A=0,B=1,C=2,D=3,E=4,F=5,G=6,H=7,I=8,J=9,K=10,L=11,M=12,N=13,O=14,P=15,
Q=16,R=17,S=18,T=19,U=20,V=21,W=22,X=23,Y=24,Z=25,a=26,b=27,c=28,d=29,
e=30,f=31,g=32,h=33,i=34,j=35,k=36,l=37,m=38,n=39,o=40,p=41,q=42,r=43,
s=44,t=45,u=46,v=47,w=48,x=49,y=50,z=51,"0"=52,"1"=53,"2"=54,"3"=55,
"4"=56,"5"=57,"6"=58,"7"=59,"8"=60,"9"=61,"+"=62,"/"=63)
fpbitma <- as.integer(intToBits(base64[as.vector(t(fpma))]))
fpbitma <- matrix(fpbitma, length(fpma[,1])*154, 32, byrow=TRUE)[,6:1]
fpbitma <- matrix(t(fpbitma), length(fpma[,1]), 6*154, byrow=TRUE)[,33:913]
pubchemFP <- pubchemFPencoding[,2]
names(pubchemFP) <- pubchemFPencoding[,1]
colnames(fpbitma) <- pubchemFP
rownames(fpbitma) <- names(fp)
if(type==1) {
return(apply(fpbitma, 1, paste, collapse=""))
}
if(type==2) {
return(fpbitma)
}
if(type==3) {
#return(as(fpbitma, "FPset"))
return(new("FPset",fpma=fpbitma,type="pubchem"))
}
}
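## Usage sketch (commented out; "pubchem.sdf" is a placeholder): converts the
## base 64 encoded PUBCHEM_CACTVS_SUBSKEYS field of an SDFset downloaded from
## PubChem into 881-bit fingerprints.
# sdfset <- read.SDFset("pubchem.sdf")
# fpset <- fp2bit(sdfset)           # type=3: FPset object
# fpmat <- fp2bit(sdfset, type=2)   # type=2: binary matrix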
## Fingerprint comparison and similarity search function
fpSimOrig <- function(x, y, sorted=TRUE, method="Tanimoto", addone=1, cutoff=0, top="all", alpha=1, beta=1, ...) {
## Predefined similarity methods
if(is.character(method)) {
if(method=="Tanimoto" | method=="tanimoto")
method <- function(a,b,c,d) (c+addone)/(a+b+c+addone)
else if(method=="Euclidean" | method=="euclidean")
method <- function(a,b,c,d) sqrt((c+d+addone)/(a+b+c+d+addone))
else if(method=="Tversky" | method=="tversky")
method <- function(a,b,c,d) (c+addone)/(alpha*a + beta*b+c+addone)
else if(method=="Dice" | method=="dice")
method <- function(a,b,c,d) (2*c+addone)/(a+c+b+c+addone)
}
## Check for valid inputs
if(!any(c(is.vector(x), inherits(x,"FP"), inherits(x,"FPset") & length(x)==1))) stop("x needs to be object of class FP, FPset of length one, or vector")
if(!any(c(is.vector(y), is.matrix(y), inherits(y,"FP"), inherits(y,"FPset") ))) stop("y needs to be object of class FP/FPset, vector or matrix")
## Convert FP/FPset inputs into vector/matrix format
if(inherits(x,"FP")) x <- as.numeric(x)
if(inherits(x,"FPset")) x <- as.numeric(x[[1]])
if(inherits(y,"FP")) y <- as.numeric(y)
if(inherits(y,"FPset")) y <- as.matrix(y)
## Compute similarities
if(is.matrix(y)) {
c <- colSums((t(y) + x) == 2)
d <- colSums((t(y) + x) == 0)
b <- rowSums(y) - c
} else {
c <- sum(x + y == 2)
d <- sum(x + y == 0)
b <- sum(y) - c
}
a <- sum(x) - c
if(sorted==TRUE) {
tmp <- rev(sort(method(a,b,c,d)))
if(top!="all") tmp <- tmp[1:top]
if(top!=1) {
return(tmp[tmp>=cutoff])
} else {
return(tmp) # always returns a value when top=1; required for compatibility with cmp.cluster
}
}
if(sorted==FALSE) {
tmp <- method(a,b,c,d)
if(top!=1) {
return(tmp[tmp>=cutoff])
} else {
return(tmp) # always returns a value when top=1; required for compatibility with cmp.cluster
}
}
}
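## Worked example (commented out) of the a/b/c/d counts used above: for
## x=c(1,1,0,1,0) and y=c(1,0,1,1,0) we get c=2 (on in both), a=1 (only x),
## b=1 (only y), d=1 (off in both), so the default Tanimoto score with
## addone=1 is (2+1)/(1+1+2+1) = 0.6.
# fpSimOrig(c(1,1,0,1,0), c(1,0,1,1,0))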
fpSim <- function(x, y, sorted=TRUE, method="Tanimoto",
addone=1, cutoff=0, top="all", alpha=1, beta=1,
parameters=NULL,scoreType="similarity") {
if(is.character(method)) {
if(method=="Tanimoto" | method=="tanimoto")
method <- 0
else if(method=="Euclidean" | method=="euclidean")
method <- 1
else if(method=="Tversky" | method=="tversky")
method <- 2
else if(method=="Dice" | method=="dice")
method <- 3
else
stop("invalid method found: ",method)
}else
return(fpSimOrig(x,y,sorted=sorted,method=method,addone=addone,cutoff=cutoff,top=top,
alpha=alpha, beta=beta))
#stop("invalid method type found: ",class(method))
if(!any(c(is.vector(x),inherits(x,"FP") , inherits(x,"FPset") & length(x)==1)))
stop("x needs to be object of class FP, FPset of length one, or vector")
if(!any(c(is.vector(y), is.matrix(y),inherits(y,"FP") ,inherits(y,"FPset") )) )
stop("y needs to be object of class FP/FPset, vector or matrix")
## Convert FP/FPset inputs into vector/matrix format
if(inherits(x,"FP"))
x <- as.numeric(x)
else if(inherits(x,"FPset"))
x <- as.numeric(x[[1]])
if(inherits(y,"FP")) {
dbSize = 1
y <- as.numeric(y)
}
else if(inherits(y,"FPset")){
dbSize=length(y)
y <- as.matrix(y)
}
else if(is.vector(y))
dbSize=1
else if(is.matrix(y))
dbSize=nrow(y)
if(is.vector(y)) y <- t(as.matrix(y))
result=.Call("similarity",x,y,method,addone,alpha,beta)
names(result) = rownames(y)
if(!is.null(parameters)){
numBitsSet = sum(x)+1
#N = parameters$count[numBitsSet]
#if(N==0){ # no stats collected for this number of bits so use global values
if(parameters$count[numBitsSet]==0){ # no stats collected for this number of bits so use global values
warning("no parameters avaliable for fingerprints with ",numBitsSet-1," bits set, using global parameters")
numBitsSet=nrow(parameters) #global stats are last element of parameters
#N = parameters$count[numBitsSet]
}
#message("using stats for ",numBitsSet-1," bits")
#print(parameters[numBitsSet,])
avg = parameters$avg[numBitsSet]
varience = parameters$varience[numBitsSet]
alpha = parameters$alpha[numBitsSet]
beta = parameters$beta[numBitsSet]
evalues = dbSize*(1-pbeta(result,alpha,beta))
# scores <<- data.frame(similarity=result, # ThG: replaced by next line on Dec 31, 2015 since build of R Markdown vignette returned error with '<<-'.
scores = data.frame(similarity=result,
zscore=(result - avg) /sqrt(varience),
evalue=evalues,
pvalue=1-exp(-evalues))
titles = 1:4
names(titles)=colnames(scores)
if(sorted){
if(titles[scoreType] <=2 ) #similarity or zscore, bigger is better
scores = scores[order(-scores[,titles[scoreType]]),]
else
scores = scores[order(scores[,titles[scoreType]]),]
if(top!="all")
scores = scores[1:top,]
}
if(!missing(cutoff)){
if(titles[scoreType] <=2 ) #similarity or zscore, bigger is better
cutScores = scores[scores[,titles[scoreType]]>= cutoff,]
else # e or p values, lower is better
cutScores = scores[scores[,titles[scoreType]]<= cutoff,]
# make sure we don't lose all results due to the cutoff
if( nrow(cutScores) >= 1)
scores = cutScores
}
scores
}
else{
if(sorted) {
result = sort(result, decreasing=TRUE)
if(top!="all")
result = result[1:top]
}
cutResult = result[result >= cutoff]
if( length(cutResult) >= 1)
cutResult
else # make sure we don't lose all results due to the cutoff
result
}
}
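## Usage sketch (commented out; `fpset' is a placeholder FPset): ranks all
## database fingerprints against a query via the C-level scorer. When a
## `parameters' data frame (e.g. one produced with genParameters) is supplied,
## z-scores, E-values and p-values are reported alongside the similarity.
# fpSim(fpset[1], fpset, method="Tanimoto", cutoff=0.4, top=10)
# params <- genParameters(fpset)
# fpSim(fpset[1], fpset, parameters=params, scoreType="evalue")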
sdf2image <- function(sdf,filename,format="SVG",
height=300,
noHbonds=TRUE,
regenCoords=FALSE,
outputOptions=c()){
.ensureOB()
if(! inherits(sdf,"SDFset") ){
stop('reference compound must be a compound of class \"SDFset\"')
}
genOps = data.frame(names=character(0),args=character(0))
outOps = data.frame(names=character(0),args=character(0))
if(height != 300){ #differs from default
#optName = if(format == "SVG") "P" else "p"
optName = "p"
outOps = rbind(outOps,data.frame(names=optName,args=as.character(height)))
}
if(noHbonds)
genOps = rbind(genOps,data.frame(names="d",args=""))
if(regenCoords)
genOps = rbind(genOps,data.frame(names="gen2D",args=""))
outOps= rbind(outOps,data.frame(names=outputOptions,args=rep("",length(outputOptions))))
defs = sdfSet2definition(sdf)
convertToImage("SDF",format,defs,filename,
options = genOps, out_options = outOps)
}
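## Usage sketch (commented out; output file name is a placeholder): renders
## the first compound of an SDFset to an SVG image via ChemmineOB.
# sdf2image(sdfset[1], "compound1.svg", format="SVG", regenCoords=TRUE)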
######################################
## Query ChemMine Web Tools Service ##
######################################
#.serverURL <- "http://chemmine.ucr.edu/ChemmineR/"
# perform sdf to smiles conversion
sdf2smilesOB <- function(sdf) {
if(! inherits(sdf,"SDFset")){
stop('reference compound must be a compound of class \"SDFset\"')
}
.ensureOB()
#sdfstrList=as(as(sdf,"SDFstr"),"list")
#defs = paste(Map(function(x) paste(x,collapse="\n"), sdfstrList),collapse="\n" )
defs = sdfSet2definition(sdf)
t=Reduce(rbind,strsplit(unlist(strsplit(convertFormat("SDF","SMI",defs),
"\n",fixed=TRUE)),
"\t",fixed=TRUE))
if(!is.matrix(t)){ # R rearranged our matrix because there was only one result
smiles=t[1]
names(smiles)=t[2]
}else{
smiles = t[,1]
names(smiles)= t[,2]
}
as(smiles, "SMIset")
}
sdf2smiles <- sdf2smilesOB
smiles2sdfOB <- function(smiles) {
if(!any(class(smiles) %in% c("character", "SMIset"))){
stop('input must be SMILES strings stored as \"SMIset\" or \"character\" object')
}
if(inherits(smiles,"SMIset")) smiles <- as.character(smiles)
.ensureOB()
sdf = definition2SDFset(convertFormat("SMI","SDF",paste(paste(smiles,names(smiles),sep="\t"), collapse="\n")))
cid(sdf)=sdfid(sdf)
sdf
}
smiles2sdf <- smiles2sdfOB
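## Usage sketch (commented out): round-trips compounds between SDF and SMILES
## through ChemmineOB; the returned SMIset is named by the molecule titles.
# smi <- sdf2smiles(sdfset[1:2])
# sdf <- smiles2sdf(smi)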
regenCoordsOB <- function(sdf){
applyOptions(sdf,data.frame(names="gen2D",args=""))
}
regenerateCoords <- regenCoordsOB
generate3DCoordsOB <- function(sdf){
applyOptions(sdf,data.frame(names="gen3D",args=""))
}
generate3DCoords <- generate3DCoordsOB
canonicalizeOB <- function(sdf){
applyOptions(sdf,data.frame(names="canonical",args=""))
}
canonicalize <- canonicalizeOB
canonicalNumberingOB <- function(sdf){
.ensureOB()
results=canonicalNumbering_OB(obmol(sdf))
names(results) = cid(sdf)
results
}
canonicalNumbering <- canonicalNumberingOB
applyOptions <- function(sdf,options){
.ensureOB()
if(inherits(sdf,"SDFset") || inherits(sdf,"SDF")){
defs = sdfSet2definition(sdf)
sdfNew = definition2SDFset(convertFormat("SDF","SDF",defs,options))
cid(sdfNew)=sdfid(sdf)
if(inherits(sdf,"SDF"))
sdfNew[[1]]
else
sdfNew
}else
stop("input to applyOptions must be a SDFset or SDF object. Found ",class(sdf))
}
times = new.env()
times$descT = 0
times$uniqueT = 0
genAPDescriptors <- function(sdf,uniquePairs=TRUE){
# t=Sys.time()
d=.Call("genAPDescriptor",sdf)
# times$descT <- times$descT + (Sys.time() - t)
if(uniquePairs){
# t=Sys.time()
d= .uniquifyAtomPairs(d)
# itimes$uniqueT <- times$uniqueT + (Sys.time() - t)
}
d
}
propOB <- function(sdfSet){
.ensureOB()
results = prop_OB(obmol(sdfSet))
rownames(results) = cid(sdfSet)
results
}
fingerprintOB <- function(sdfSet,fingerprintName){
.ensureOB()
x = fingerprint_OB(obmol(sdfSet),fingerprintName)
if(is.vector(x)) x= t(as.matrix(x))
fpset = new("FPset",fpma=x,
type=fingerprintName)
cid(fpset) = cid(sdfSet)
fpset
}
smartsSearchOB <- function(sdfset,smartsPattern,uniqueMatches=TRUE){
.ensureOB()
smartsSearch_OB(obmol(sdfset),smartsPattern,uniqueMatches)
}
exactMassOB <- function(sdfset){
.ensureOB()
exactMass_OB(obmol(sdfset))
}
#compounds should be items that can be passed into similarity
maximallyDissimilar <- function(compounds,n,similarity = cmp.similarity) {
dist = function(a,b) 1-similarity(a,b)
if(length(compounds) <= 2)
stop("maximallyDissimilar: compounds must have more than 2 elements")
#pick first selection at random
selected = sample(1:length(compounds),1)
#compute dist between compounds[-selected] and compounds[selected]
minDistsToSelected = sapply(1:length(compounds),
function(i) dist(compounds[[i]],compounds[[selected]]))
for(k in 2:n){
#find candidate farthest from those selected
newPoint = which.max(minDistsToSelected)
selected = c(selected,newPoint)
minDistsToSelected[newPoint] = 0
#update min distances
minDistsToSelected = sapply(1:length(compounds),
function(i)
if(minDistsToSelected[i]==0) 0
else min(minDistsToSelected[i],dist(compounds[i],compounds[newPoint])))
}
compounds[selected]
}
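## Usage sketch (commented out): greedily picks 5 compounds of an APset that
## are far apart under 1 - cmp.similarity (farthest-point style selection);
## the first pick is random, so set a seed for reproducible results.
# set.seed(42)
# diverse <- maximallyDissimilar(apset, 5)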
sdf2OBMol<- function(sdfSet){
.ensureOB()
defs = paste(Map(function(x) paste(x,collapse="\n"),
as(as(sdfSet,"SDFstr"),"list")),"\n",
sep="",collapse="" )
forEachMol("SDF",defs,identity,c)
}
largestComponent <- function(sdfSet){
smiles = as.character(sdf2smiles(sdfSet))
splits = strsplit(smiles,'.',fixed=TRUE)
largestComps = sapply(splits,
function(comps){
comps[sort.int(vapply(comps,function(x){nchar(x)},0),
decreasing=TRUE,
index.return=TRUE)$ix][1]
})
#print(largestComps)
compsSdf = smiles2sdf(largestComps)
cid(compsSdf) = cid(sdfSet)
compsSdf
}
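## Usage sketch (commented out): strips counter-ions and solvents by keeping
## only the largest connected component of each molecule, chosen as the
## longest dot-separated fragment of its SMILES string.
# cleaned <- largestComponent(sdfset)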