R/sim.R


cstrsplitSym=NA
.onLoad <- function(libname,pkgname) {

	#message("libname: ",libname)
	#message("pkgname: ",pkgname)
	cstrsplitSym <<- getNativeSymbolInfo("cstrsplit")
}

.db.header.size <- 16
# supported elements
.elements <- list(
    R=0,
    H=1,
    He=2,
    Li=3,
    Be=4,
    B=5,
    C=6,
    N=7,
    O=8,
    F=9,
    Ne=10,
    Na=11,
    Mg=12,
    Al=13,
    Si=14,
    P=15,
    S=16,
    Cl=17,
    Ar=18,
    K=19,
    Ca=20,
    Sc=21,
    Ti=22,
    V=23,
    Cr=24,
    Mn=25,
    Fe=26,
    Co=27,
    Ni=28,
    Cu=29,
    Zn=30,
    Ga=31,
    Ge=32,
    As=33,
    Se=34,
    Br=35,
    Kr=36,
    Rb=37,
    Sr=38,
    Y=39,
    Zr=40,
    Nb=41,
    Mo=42,
    Tc=43,
    Ru=44,
    Rh=45,
    Pd=46,
    Ag=47,
    Cd=48,
    In=49,
    Sn=50,
    Sb=51,
    Te=52,
    I=53,
    Xe=54,
    Cs=55,
    Ba=56,
    La=57,
    Ce=58,
    Pr=59,
    Nd=60,
    Pm=61,
    Sm=62,
    Eu=63,
    Gd=64,
    Tb=65,
    Dy=66,
    Ho=67,
    Er=68,
    Tm=69,
    Yb=70,
    Lu=71,
    Hf=72,
    Ta=73,
    W=74,
    Re=75,
    Os=76,
    Ir=77,
    Pt=78,
    Au=79,
    Hg=80,
    Tl=81,
    Pb=82,
    Bi=83,
    Po=84,
    At=85,
    Rn=86,
    Fr=87,
    Ra=88,
    Ac=89,
    Th=90,
    Pa=91,
    U=92,
    Np=93,
    Pu=94,
    Am=95,
    Cm=96,
    Bk=97,
    Cf=98,
    Es=99,
    Fm=100,
    Md=101,
    No=102,
    Lr=103,
    Rf=104,
    Db=105,
    Sg=106,
    Bh=107,
    Hs=108,
    Mt=109,
    Ds=110,
    Rg=111)



# similarity model
# input: descriptors for two compounds. both are vectors
# output: similarity
.cmp.similarity <- function(a, b, mode=1, worst=0,
    dbcon.a=NULL, db.intsize.a=4,
    dbcon.b=NULL, db.intsize.b=4)
{
    ## ThG: added for compatibility with new S4 classes APset/AP
    if(inherits(a,"APset")) { a <- ap(a)[[1]] }
    if(inherits(b,"APset")) { b <- ap(b)[[1]] }
    if(inherits(a,"AP")){ a <- ap(a) }
    if(inherits(b,"AP")){ b <- ap(b) }
    ## ThG: end of lines
    # check arguments; make sure they are vectors, not lists
    if (is.character(a) && length(a) == 3 && a[[1]] == 'filedb:') 
        a <- .load.file.backed.desc(a, dbcon.a, db.intsize.a)
    if (is.character(b) && length(b) == 3 && b[[1]] == 'filedb:') 
        b <- .load.file.backed.desc(b, dbcon.b, db.intsize.b)
    if (is.null(b) || is.null(a))
        return(0)
    if (!is.numeric(b) || !is.numeric(a) ) {
        stop("Both arguments must be AP/APset objects or descriptors in the form of vectors.\n",
        "Did you use \"[]\" instead of \"[[]]\" to index the descriptor ",
        "database?")
    }

    if (mode == 1 && worst != 0) {
        # estimate upper bound
        i <- length(a)
        j <- length(b)
        u_bound <- min(i, j) / max(i, j)
        if (u_bound < worst)
            return(0)
    }
    if (mode == 0) {
        # if no mode is given, guess a mode based on compound size
        # difference 
        if (length(a) < .25 * length(b) || length(a) > 4 * length(b))
            mode <- 3
        else
            mode <- 1
    }
    if (mode == 1) 
        return (length(intersect(a, b)) / length(union(a, b)))
    else if (mode == 2)
        return (length(intersect(a, b)) / min(length(a), length(b)))
    else if (mode == 3) {
        s <- length(intersect(a, b)) / min(length(a), length(b))
        return (s^3)
    }
}
cmp.similarity <- function(a, b, mode=1, worst=0)
{
    .cmp.similarity(a, b, mode=mode, worst=worst)
}
cmp.parse1 <- function(filename) sdf2ap(read.SDFset(filename))[[1]]
cmp.parse <- function(filename) apset2descdb(sdf2ap(read.SDFset(filename)))
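## Example (illustrative sketch; `sdfsample' is the demo SDFset shipped with
## ChemmineR and "mycompounds.sdf" is a placeholder path to any SD file):
##   data(sdfsample)
##   apset <- sdf2ap(sdfsample[1:10])        # atom-pair descriptors (APset)
##   cmp.similarity(apset[1], apset[2])      # Tanimoto-type similarity, mode=1
##   db <- cmp.parse("mycompounds.sdf")      # legacy list-based descriptor database
##   query <- cmp.parse1("mycompounds.sdf")  # descriptors of the first compound only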


# Search db for similar compounds. `db' is the database returned by
# `cmp.parse' or `sdfs_to_desc'. `query' is the descriptor of one compound,
# either returned by `cmp.parse1'/`sdf_to_desc' or a reference to one entry in
# db such as db$descdb[[123]]. Two types of cutoff are supported: a score
# cutoff (<=1) or a count cutoff (>1).
cmp.search <- function(db, query, type=1, cutoff=0.5, return.score=FALSE, quiet=FALSE,
	mode=1, visualize=FALSE, visualize.browse=TRUE, visualize.query=NULL) 
{
    ## ThG: added for compatibility with new S4 classes APset/AP
    ## Note: a type argument was also added (it has no impact on the old list object).
    dbtype <- as.character(class(db))
    if(dbtype=="APset") { db <- apset2descdb(db) }
    if(inherits(query,"APset")) query <- ap(query[[1]]) 
    if(inherits(query,"AP")) query <- ap(query) 
    ## ThG: end of lines
    if (.is.file.backed.desc(query))
        query <- .load.file.backed.desc(query)
    dbcon <- NULL
    intsize <- 4
    if (db$type == 'file-backed') {
        for (i in 1:length(db$descdb)) {
            if (.is.file.backed.desc(db$descdb[[i]])) {
                dbcon <- file(paste(db$descdb[[i]][[2]], '.cdb', sep=''), 'rb')
                seek(dbcon, .db.header.size)
                intsize <- readBin(dbcon, integer(), n=1, size=1)
                break
            }
        }
    }
    scores <- array()
    ids <- array()
    pos <- 1    # tail of entries
    size <- length(db$cids)
    perstep <- max(1, round(size / 25)) # guard against databases with fewer than 25 entries
    for (i in 1:size) {
        if (!quiet && i %% perstep == 0) {
            steps <- i / perstep
            .progress_bar(paste(min(steps*4, 100), "%", sep=""))
        }
        cmp <- db$descdb[[i]]
        if (db$type == 'file-backed' && .is.file.backed.desc(db$descdb[[i]]))
            cmp <- .load.file.backed.desc(cmp, dbcon, intsize)
        if (cutoff <= 1)
            score <- .cmp.similarity(cmp, query, mode=mode, worst=cutoff)
        else if (pos - 1 < cutoff)
            score <- .cmp.similarity(cmp, query, mode=mode, worst=0)
        else
            score <- .cmp.similarity(cmp, query, mode=mode,
                    worst=scores[[pos - 1]])
        if (cutoff <= 1) {
            if (score >= cutoff) {
                scores[[pos]] <- score
                ids[[pos]] <- i
                pos <- pos + 1
            }
        } else {
            # using naive insertion sort
            if (pos - 1 < cutoff)
                pos <- pos + 1
            else if (scores[[pos - 1]] >= score)
                next

            pos_ <- pos - 2
            while (pos_ > 0 && scores[[pos_]] < score) {
                scores[[pos_ + 1]] <- scores[[pos_]]
                ids[[pos_ + 1]] <- ids[[pos_]]
                pos_ <- pos_ - 1
            }
            pos_ <- pos_ + 1
            scores[[pos_]] <- score
            ids[[pos_]] <- i
        }
    }
    if (!is.null(dbcon)) close(dbcon)
    if (!quiet) cat("\n")
    if (cutoff <= 1) {
        order <- sort(scores, index.return=TRUE)[["ix"]]
        order <- order[length(order):1]
    } else
        order <- 1:length(scores)

    ## ThG: modified/added for compatibility with new S4 classes APset/AP
    if (return.score && dbtype=="list") {
        return(data.frame(ids=ids[order], scores=scores[order]))
    }
    if (!return.score && dbtype=="list") {
        return(ids[order])
    }
    if (dbtype=="APset") {
	if(type==1) {
		return(ids[order])
	}
	if(type==2) {
		index <- scores[order]
		names(index) <- db$cids[ids[order]]
		return(index)
	}
	if(type==3) {
		return(data.frame(index=ids[order], cid=db$cids[ids[order]], scores=scores[order]))
	}
     }
    ## ThG: end of lines
}
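## Example (illustrative sketch, continuing with `apset' from above): a cutoff
## <= 1 is a minimum similarity score, a cutoff > 1 is the number of top hits.
##   cmp.search(apset, apset[1], type=3, cutoff=0.5)  # all hits scoring >= 0.5
##   cmp.search(apset, apset[1], type=2, cutoff=10)   # the 10 best-scoring hits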

# Subset SDFs: given the indices of selected compounds, return the concatenation
# of their SDF records.
sdf.subset <- function(db, cmps)
{
    if (length(grep('://', db$source)) == 0 && !file.exists(db$source)) {
        stop("cannot find the source SDF file for this database\n")
    }
    output <- ""
    for (i in cmps)
    {
        sdf <- .extract_single_sdf(db, i)
        output <- paste(output, sdf, sep="")
    }

    return(output)
}

## ThG: function is obsolete for APset class which works with standard R subsetting utilities.
db.subset <- function(db, cmps)
{
    return(list(descdb=db$descdb[cmps], cids=db$cids[cmps],
                sdfsegs=db$sdfsegs[cmps], source=db$source))
}

# Helper function for sdf.subset. Takes an index and returns the SDF record of
# the compound at that index.
.extract_single_sdf <- function(db, index)
{
    if (index > length(db$sdfsegs))
        stop("Index out of range in .extract_single_sdf")

    skip <- db$sdfsegs[[index]]
    con <- file(db$source, open="r")
    if (skip > 0) 
        #scan(con, "raw", skip=skip-1, nlines=1, quiet=TRUE)
        readLines(con, skip)
    sdf <- .read.one.sdf(con, collapse=FALSE)

    # replace line 1 with compound index if no name is available
    if (length(grep("[a-zA-Z0-9]", sdf[[1]])) == 0)
        sdf[[1]] <- paste("ChemmineR_Unnamed_Compound_", index, sep="")

    # close connection
    close(con)
    return(paste(sdf, sep="", collapse="\n"))
}

# explain an atom pair descriptor
db.explain <- function(desc)
{
    ## ThG: added for compatibility with new S4 classes APset/AP ##
    if(inherits(desc,"APset")) desc <- ap(desc[[1]])
    if(inherits(desc,"AP")) desc <- ap(desc) 
    ## ThG: end of lines ##
    if (is.character(desc)) {
        if (.is.file.backed.desc(desc))
            desc <- .load.file.backed.desc(desc)
        else
            stop("The descriptor(s) to be explained is not valid.")
    }
    if (length(desc) == 1)
        return(.explain(desc))
    else {
        result <- c()
        for (i in 1:length(desc))
            result <- c(result, .explain(desc[[i]]))
        return(result)
    }
}

.explain <- function(desc)
{
    # occurrence count (stored in the low 7 bits; not part of the output string)
    occurrence <- desc %% 2^7 + 1
    desc <- desc %/% 2^7
    # extract left atom, distance, and right atom
    left <- desc %/% 2^20
    desc <- desc %% 2^20
    dist <- desc %/% 2^13
    desc <- desc %% 2^13
    right <- desc
    # atom properties
    left <- .interpret_atom(left)
    right <- .interpret_atom(right)

    # output
    return (paste(left$symbol, " [", left$neighbors, " neighbor(s),",
                left$pi_electrons, " pi electrons]<---",
                dist,
                "--->", right$symbol, " [", right$neighbors,
                " neighbor(s),", right$pi_electrons, " pi electrons]",
                collapse="", sep=""))
}

# interpret an atom descriptor
# see also: .gen_atom_desc
.interpret_atom <- function(atom_desc)
{
    # atom
    atom_symbol <- names(.elements[.elements == atom_desc %/% 2^6])
    # neighbors
    neighbors <- atom_desc %% 2^6 %/% 2^3
    # pi electron
    pi_electrons <- atom_desc %% 2^3
    return(list(symbol=atom_symbol, neighbors=neighbors,
                pi_electrons=pi_electrons))
}
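## Worked example of the encoding unpacked above (pure arithmetic, for
## illustration): an atom-pair descriptor stores, from high to low bits, the
## left atom code, the bond distance (7 bits), the right atom code (13 bits)
## and an occurrence counter (7 bits); each atom code packs
## element*2^6 + neighbors*2^3 + pi_electrons.
##   left  <- 6*2^6 + 3*2^3 + 1                   # 409: C, 3 neighbors, 1 pi electron
##   right <- 8*2^6 + 1*2^3 + 1                   # 521: O, 1 neighbor,  1 pi electron
##   desc  <- (left*2^20 + 4*2^13 + right)*2^7    # 4 bonds apart, occurrence bits 0
##   .explain(desc)
##   ## "C [3 neighbor(s),1 pi electrons]<---4--->O [1 neighbor(s),1 pi electrons]"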

# Convert a factor of descriptor values to a numeric vector to simplify union
# and intersection operations. Repeated values are disambiguated by an
# occurrence counter in the low 7 bits: each value is shifted left by 7 bits,
# so if value `a' appears twice it becomes `a*2^7' and `a*2^7 + 1'.
# This is an internal function and is not intended to be used by users.
.factor_to_vector <- function(fac)
{
    vec <- vector(mode='numeric')
    index <- 1
    freq <- table(fac)
    keys <- names(freq)
    for (i in 1:length(keys)) {
        key <- keys[[i]]
        if (freq[[key]] == 1) {
            vec[[index]] <- as.numeric(key) * 2^7
            index <- index + 1
        } else {
            for (j in 1:freq[[key]]) {
                vec[[index]] <- as.numeric(key) * 2^7 + j - 1
                index <- index + 1
            }
        }
    }

    return(vec)
}
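## Example (illustrative): repeated descriptor values receive consecutive
## occurrence counters in the low 7 bits.
##   .factor_to_vector(as.factor(c(5, 7, 7)))
##   ## [1] 640 896 897   # i.e. 5*2^7, 7*2^7, 7*2^7 + 1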

.uniquifyAtomPairs <- function(desc) {
	.Call("uniquifyAtomPairs",desc)
}

.progress_bar_int_cnt <- 0
.progress_bar <- function(label=NULL) {
    progress <- c("|", "/", "-", "\\")
    unlockBinding(".progress_bar_int_cnt", environment(.progress_bar))
    .progress_bar_int_cnt <<- .progress_bar_int_cnt %% 4 + 1
    if (is.null(label)) {
        cat("\r", progress[[.progress_bar_int_cnt]])
    } else {
        cat("\r                                           \r")
        cat(progress[[.progress_bar_int_cnt]], label)
    }
}

.postToHost <- function(host, path, data.to.send, port=80)
{
    if(missing(path)) path <- "/"
    ua <- "User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.1)"


    boundary <- paste(
            as.integer(runif(20, min=1, max=10)), collapse=""
    )

    header <- NULL
    header <- c(header, paste("POST ", path, " HTTP/1.1", sep=""))
    header <- c(header, paste("Host: ", host, sep=""))
    header <- c(header, "Connection: close")
    header <- c(header, ua)
    header <- c(header, "Accept: */*")
    header <- c(header,
            paste("Content-type: multipart/form-data; boundary=",
                boundary, sep="")
    )
    boundary <- paste('--', boundary, sep='')

    mcontent <- NULL # keeps the content.
    size <- 0

    for (i in 1:length(data.to.send)) {
        value <- data.to.send[[i]]
        key <- names(data.to.send)[i]

        content <- paste(
            boundary, "\n",
            "Content-Disposition: form-data; name=\"",
            as.character(key), "\"", "\n",
            "\n",
            as.character(value), "\n",
            sep=""
        )
        mcontent <- c(mcontent, charToRaw(content))
    }

    size <- length(mcontent) + length(strsplit(boundary, "")[[1]]) + 4
    header <- c(header, paste("Content-length: ", size, "\n\n", sep=""))

    postdata <- c(
        charToRaw(paste(header, collapse="\n")),
        mcontent,
        charToRaw(paste(boundary,"--\n\n",sep=""))
    )

    rm(header,mcontent)

    scon <- socketConnection(host=host,port=port,open="a+b",blocking=TRUE)
    writeBin(postdata, scon, size=1)

    output <- character(0)
    repeat{
        ss <- rawToChar(readBin(scon, "raw", 2048))
        output <- paste(output,ss,sep="")
        if(regexpr("\r\n0\r\n\r\n",ss)>-1) break()
        if(ss == "") break()
    }
    close(scon)
    return(output)
}

.read.one.sdf <- function(filename, collapse=TRUE)
{
    if (length(filename) != 1)
        stop('reference sdf must be a vector of only one entry!')

    if (!("connection" %in% class(filename)) && !is.character(filename))
        stop('reference sdf must be an SDF file or one character string of sdf')

    if ("connection" %in% class(filename))
        con <- filename
    # is filename really a filename or the SDF content itself? check for newline characters
    else if (length(grep("\n", filename)) == 0 &&
        length(grep("\r", filename)) == 0)
        con <- file(filename, open="r")
    else
        con <- textConnection(filename)

    result <- c()
    while (TRUE) {
        line <- readLines(con=con, n=1)
        if (length(line) == 0 || "$$$$" %in% line) {
            if (length(result) == 0)
                stop("empty SDF!")
            result <- c(result, '$$$$\n')
            if (collapse)
                return (paste(result, collapse="\n"))
            else
                return (result)
        } else
            result <- c(result, line)
    }
}

.data.frame.to.str <- function(x)
{
    con <- textConnection("string", "w", local=TRUE) # keep the captured text local to this function
    write.table(format(x), con, row.names=F, quote=F)
    close(con)
    return(paste(string, collapse='\n'))
} 

.is.file.backed.desc <- function(desc)
{
     is.character(desc) && length(desc) == 3 && desc[[1]] == 'filedb:'
}

.load.file.backed.desc <- function(desc, dbcon=NULL, intsize=4)
{
    dbname <- desc[[2]]
    pos <- desc[[3]]
    closeme <- FALSE
    if (is.null(dbcon)) {
        db_a <- paste(dbname, '.cdb', sep='')
        if (!file.exists(db_a)) stop("cannot find database file ", db_a)
        dbcon <- file(db_a, 'rb')
        closeme <- TRUE
        seek(dbcon, .db.header.size)
        intsize <- readBin(dbcon, integer(), n=1, size=1)
    }
    seek(dbcon, as.integer(pos))
    size <- readBin(dbcon, integer(), n=1, size=intsize)
    desc <- readBin(dbcon, integer(), n=size, size=intsize)
    if (closeme) close(dbcon)
    .factor_to_vector(as.factor(desc))
}
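## Sketch of the on-disk format as read here: a '<dbname>.cdb' file with a
## .db.header.size-byte header, a one-byte integer size stored at that offset,
## and each descriptor stored at its recorded byte position as a
## length-prefixed block of `intsize'-byte integers. A file-backed descriptor
## reference is a character vector such as c("filedb:", "mydb", "1024")
## (hypothetical name and offset) and is resolved lazily:
##   if (.is.file.backed.desc(desc)) desc <- .load.file.backed.desc(desc)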

.haveOB <- function()
{
	if(suppressWarnings(require('ChemmineOB', quietly=TRUE))
		&& ChemmineOB:::.supportedPlatform()) {
		return(TRUE)
	} else {
		return(FALSE)
	}
}
.ensureOB <- function(mesg = paste("ChemmineOB is required to make use of this function.",
										 "This package can be installed from Bioconductor with the",
										 "command 'BiocManager::install(\"ChemmineOB\")'.",
										 "It is not currently available for Windows, however.",
										 "See http://bioconductor.org/packages/devel/bioc/html/ChemmineOB.html",
										 "for more information."))
{
	if(!.haveOB())
		stop(mesg)
}


##############################################
## PubChem Fingerprint Similarity Searching ##
##############################################

## Convert base 64 encoded PubChem fingerprints to binary matrix or string vector
## Definition of PubChem's substructure dictionary-based fingerprints:
## ftp://ftp.ncbi.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
## For debugging use command: as.integer(rev(intToBits("19")[1:6])) 

## Load PubChem's substructure dictionary from the data directory of the package.
## The self-assignment turns the lazily loaded data set into a regular variable
## in the package namespace.
data(pubchemFPencoding); pubchemFPencoding <- pubchemFPencoding
fp2bit <- function(x, type=3, fptag="PUBCHEM_CACTVS_SUBSKEYS") {
	## Convert base 64 strings to matrix
	if(inherits(x,"SDFset")) {
		blockmatrix <- datablock2ma(datablocklist=datablock(x))
		fp <- blockmatrix[, fptag]
	}
	if(is.matrix(x)) { 
		blockmatrix <- x
		fp <- blockmatrix[, fptag]
	}
	if(is.character(x)) { 
		fp <- x
	}
	fpma <- unlist(strsplit(fp, ""))
	fpma <- matrix(fpma, length(fp), nchar(fp[1]), byrow=TRUE)
	fpma <- fpma[, 1:154] # remove padding signs '='

	## base 64 decoding (base 64 alphabet from http://www.faqs.org/rfcs/rfc3548.html)
	base64 <- c(A=0,B=1,C=2,D=3,E=4,F=5,G=6,H=7,I=8,J=9,K=10,L=11,M=12,N=13,O=14,P=15,
                    Q=16,R=17,S=18,T=19,U=20,V=21,W=22,X=23,Y=24,Z=25,a=26,b=27,c=28,d=29,
                    e=30,f=31,g=32,h=33,i=34,j=35,k=36,l=37,m=38,n=39,o=40,p=41,q=42,r=43,
                    s=44,t=45,u=46,v=47,w=48,x=49,y=50,z=51,"0"=52,"1"=53,"2"=54,"3"=55,
                    "4"=56,"5"=57,"6"=58,"7"=59,"8"=60,"9"=61,"+"=62,"/"=63)
	fpbitma <- as.integer(intToBits(base64[as.vector(t(fpma))]))
	fpbitma <- matrix(fpbitma, length(fpma[,1])*154, 32, byrow=TRUE)[,6:1]
	fpbitma <- matrix(t(fpbitma), length(fpma[,1]), 6*154, byrow=TRUE)[,33:913]
        pubchemFP <- pubchemFPencoding[,2]
	names(pubchemFP) <- pubchemFPencoding[,1]
	colnames(fpbitma) <- pubchemFP
	rownames(fpbitma) <- names(fp)
	if(type==1) {
                return(apply(fpbitma, 1, paste, collapse=""))  
        }
	if(type==2) {
                return(fpbitma)
        }
	if(type==3) {
                #return(as(fpbitma, "FPset"))
                return(new("FPset",fpma=fpbitma,type="pubchem"))
        }
}
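## Example (illustrative sketch; assumes the compounds carry PubChem's
## PUBCHEM_CACTVS_SUBSKEYS data block, as the `sdfsample' demo set does):
##   data(sdfsample)
##   fpset <- fp2bit(sdfsample)           # FPset holding the 881 PubChem bits
##   fpma  <- fp2bit(sdfsample, type=2)   # plain 0/1 matrix, one row per compound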

## Fingerprint comparison and similarity search function 
fpSimOrig <- function(x, y, sorted=TRUE, method="Tanimoto", addone=1, cutoff=0, top="all", alpha=1, beta=1, ...) {
	## Predefined similarity methods
	if(is.character(method)) {
	 	if(method=="Tanimoto" | method=="tanimoto") 
			method <- function(a,b,c,d) (c+addone)/(a+b+c+addone)
		else if(method=="Euclidean" | method=="euclidean") 
			method <- function(a,b,c,d) sqrt((c+d+addone)/(a+b+c+d+addone))
		else if(method=="Tversky" | method=="tversky") 
			method <- function(a,b,c,d) (c+addone)/(alpha*a + beta*b+c+addone)
		else if(method=="Dice" | method=="dice") 
			method <- function(a,b,c,d) (2*c+addone)/(a+c+b+c+addone)
	}
	## Check for valid inputs
	if(!any(c(is.vector(x), inherits(x,"FP"), inherits(x,"FPset") & length(x)==1))) stop("x needs to be object of class FP, FPset of length one, or vector")
   if(!any(c(is.vector(y), is.matrix(y), inherits(y,"FP"), inherits(y,"FPset") ))) stop("y needs to be object of class FP/FPset, vector or matrix")
	## Convert FP/FPset inputs into vector/matrix format
	if(inherits(x,"FP")) x <- as.numeric(x)
	if(inherits(x,"FPset")) x <- as.numeric(x[[1]])
	if(inherits(y,"FP")) y <- as.numeric(y)
	if(inherits(y,"FPset")) y <- as.matrix(y)
        ## Compute similarities
	if(is.matrix(y)) {
		c <- colSums((t(y) + x) == 2)
		d <- colSums((t(y) + x) == 0)
		b <- rowSums(y) - c
	} else {
		c <- sum(x + y == 2)
		d <- sum(x + y == 0)
		b <- sum(y) - c
	}
	a <- sum(x) - c
	if(sorted==TRUE) {
                tmp <- rev(sort(method(a,b,c,d)))
                if(top!="all") tmp <- tmp[1:top]
		if(top!=1) { 
			return(tmp[tmp>=cutoff])
		} else {
			return(tmp) # always returns a value when top=1; required for compatibility with cmp.cluster
		}
	}
	if(sorted==FALSE) {
		tmp <- method(a,b,c,d)
		if(top!=1) {
                	return(tmp[tmp>=cutoff])
		} else {
			return(tmp) # always returns a value when top=1; required for compatibility with cmp.cluster
		}
	}
}
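## Worked example of the a/b/c/d counts used above: a = bits set only in x,
## b = bits set only in y, c = bits set in both, d = bits set in neither.
##   x <- c(1,1,0,1,0); y <- c(1,0,0,1,1)      # a=1, b=1, c=2, d=1
##   fpSimOrig(x, y, sorted=FALSE)             # (c+1)/(a+b+c+1) = 3/5 = 0.6
##   fpSimOrig(x, y, sorted=FALSE, addone=0)   # plain Tanimoto: 2/4 = 0.5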

fpSim <- function(x, y, sorted=TRUE, method="Tanimoto", 
						addone=1, cutoff=0, top="all", alpha=1, beta=1,
						parameters=NULL,scoreType="similarity") {

	if(is.character(method)) {
	 	if(method=="Tanimoto" | method=="tanimoto") 
			method <- 0
		else if(method=="Euclidean" | method=="euclidean") 
			method <- 1
		else if(method=="Tversky" | method=="tversky") 
			method <- 2
		else if(method=="Dice" | method=="dice") 
			method <- 3
		else 
			stop("invalid method found: ",method)
	}else
		return(fpSimOrig(x,y,sorted=sorted,method=method,addone=addone,cutoff=cutoff,top=top,
							  alpha=alpha, beta=beta))
		#stop("invalid method type found: ",class(method))


	if(!any(c(is.vector(x),inherits(x,"FP") , inherits(x,"FPset") & length(x)==1))) 
		stop("x needs to be object of class FP, FPset of length one, or vector")
   if(!any(c(is.vector(y), is.matrix(y),inherits(y,"FP") ,inherits(y,"FPset") )) )
		stop("y needs to be object of class FP/FPset, vector or matrix")

	## Convert FP/FPset inputs into vector/matrix format
	if(inherits(x,"FP")) 
		x <- as.numeric(x)
	else if(inherits(x,"FPset")) 
		x <- as.numeric(x[[1]])

	if(inherits(y,"FP")) {
		dbSize = 1
		y <- as.numeric(y)
	}
	else if(inherits(y,"FPset")){
		 dbSize=length(y)
		 y <- as.matrix(y)
	}
	else if(is.vector(y))
		dbSize=1
	else if(is.matrix(y))
		dbSize=nrow(y)

	if(is.vector(y)) y <- t(as.matrix(y))
   

	result=.Call("similarity",x,y,method,addone,alpha,beta)
	names(result) = rownames(y)

	if(!is.null(parameters)){
		numBitsSet = sum(x)+1

		#N = parameters$count[numBitsSet]
		#if(N==0){ # no stats collected for this number of bits so use global values
		if(parameters$count[numBitsSet]==0){ # no stats collected for this number of bits, so use global values
			warning("no parameters available for fingerprints with ",numBitsSet-1," bits set, using global parameters")
			numBitsSet=nrow(parameters) #global stats are last element of parameters
			#N = parameters$count[numBitsSet]
		}

		#message("using stats for ",numBitsSet-1," bits")
		#print(parameters[numBitsSet,])
		
		avg = parameters$avg[numBitsSet]
		varience = parameters$varience[numBitsSet]
		alpha = parameters$alpha[numBitsSet]
		beta = parameters$beta[numBitsSet]

		evalues = dbSize*(1-pbeta(result,alpha,beta))
		# scores <<- data.frame(similarity=result, # ThG: replaced by next line on Dec 31, 2015 since build of R Markdown vignette returned error with '<<-'.
		scores = data.frame(similarity=result,
									 zscore=(result - avg) /sqrt(varience),
									 evalue=evalues,
									 pvalue=1-exp(-evalues))
		
		titles = 1:4
		names(titles)=colnames(scores)
		if(sorted){
			if(titles[scoreType] <=2 ) #similarity or zscore, bigger is better
				scores = scores[order(-scores[,titles[scoreType]]),]
			else
				scores = scores[order(scores[,titles[scoreType]]),]
			if(top!="all")
				scores = scores[1:top,]
		}

		if(!missing(cutoff)){
			if(titles[scoreType] <=2 ) #similarity or zscore, bigger is better
				cutScores = scores[scores[,titles[scoreType]]>= cutoff,]
			else   # e or p values, lower is better
				cutScores = scores[scores[,titles[scoreType]]<= cutoff,]

			# make sure we don't lose all results due to the cutoff
			if( nrow(cutScores) >= 1)
				scores = cutScores
		}
		scores
	}
	else{

		if(sorted) {
			result = sort(result, decreasing=TRUE)
			if(top!="all")
				result = result[1:top]
		}
		cutResult = result[result >= cutoff]
		if( length(cutResult) >= 1)
			cutResult
		else # make sure we don't lose all results due to the cutoff
			result
	}
	

}
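## Example (illustrative sketch; `fpset' as produced by fp2bit() above; the
## optional `parameters' data frame holds pre-computed per-bit-count statistics,
## e.g. from genParameters() where available):
##   fpSim(fpset[1], fpset, method="Tanimoto", cutoff=0.4, top=10)
##   # params <- genParameters(fpset)
##   # fpSim(fpset[1], fpset, parameters=params, scoreType="evalue")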


sdf2image <- function(sdf,filename,format="SVG",
							 height=300,
							 noHbonds=TRUE,
							 regenCoords=FALSE,
							 outputOptions=c()){
	 .ensureOB()

    if(! inherits(sdf,"SDFset") ){
        stop('reference compound must be a compound of class \"SDFset\"')
    } 

	 genOps = data.frame(names=character(0),args=character(0))
	 outOps = data.frame(names=character(0),args=character(0))

	 if(height != 300){ #differs from default
		 #optName = if(format == "SVG") "P" else "p"
		 optName = "p"
		 outOps = rbind(outOps,data.frame(names=optName,args=as.character(height)))
	 }
	 if(noHbonds)
		 genOps = rbind(genOps,data.frame(names="d",args=""))
	 if(regenCoords)
		 genOps = rbind(genOps,data.frame(names="gen2D",args=""))

	 outOps= rbind(outOps,data.frame(names=outputOptions,args=rep("",length(outputOptions))))


	 defs = sdfSet2definition(sdf)
	 convertToImage("SDF",format,defs,filename,
						 options = genOps, out_options = outOps)

}
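## Example (illustrative sketch; requires the ChemmineOB package):
##   data(sdfsample)
##   sdf2image(sdfsample[1], filename="cmp1.svg", format="SVG", height=400)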

######################################
## Query ChemMine Web Tools Service ##
######################################
#.serverURL <- "http://chemmine.ucr.edu/ChemmineR/"

# perform sdf to smiles conversion
sdf2smilesOB <- function(sdf) {
    if(! inherits(sdf,"SDFset")){
        stop('reference compound must be a compound of class \"SDFset\"')
    } 

	 .ensureOB()
	 #sdfstrList=as(as(sdf,"SDFstr"),"list")
	 #defs = paste(Map(function(x) paste(x,collapse="\n"), sdfstrList),collapse="\n" )
	 defs = sdfSet2definition(sdf)
	 t=Reduce(rbind,strsplit(unlist(strsplit(convertFormat("SDF","SMI",defs),
														  "\n",fixed=TRUE)),
				 "\t",fixed=TRUE))
	 if(!is.matrix(t)){ # R rearranged our matrix because there was only one result
		 smiles=t[1]
		 names(smiles)=t[2]
	 }else{
		 smiles = t[,1]
		 names(smiles)= t[,2]
	 }
	 as(smiles, "SMIset")
}
sdf2smiles <- sdf2smilesOB



smiles2sdfOB <- function(smiles) {
    if(!any(class(smiles) %in% c("character", "SMIset"))){
        stop('input must be SMILES strings stored as \"SMIset\" or \"character\" object')
    }
	 if(inherits(smiles,"SMIset")) smiles <- as.character(smiles)
	 .ensureOB()
	 sdf = definition2SDFset(convertFormat("SMI","SDF",paste(paste(smiles,names(smiles),sep="\t"), collapse="\n")))
	 cid(sdf)=sdfid(sdf)
	 sdf
}
smiles2sdf <- smiles2sdfOB
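## Example (illustrative sketch; requires ChemmineOB):
##   smi <- sdf2smilesOB(sdfsample[1:4])   # SMIset named by compound id
##   sdf <- smiles2sdfOB(smi)              # and back to an SDFset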

regenCoordsOB <- function(sdf){
	applyOptions(sdf,data.frame(names="gen2D",args=""))
}
regenerateCoords <- regenCoordsOB

generate3DCoordsOB <- function(sdf){
	applyOptions(sdf,data.frame(names="gen3D",args=""))
}
generate3DCoords <- generate3DCoordsOB

canonicalizeOB <- function(sdf){
	applyOptions(sdf,data.frame(names="canonical",args=""))
}
canonicalize <- canonicalizeOB

canonicalNumberingOB <- function(sdf){
	.ensureOB()
	results=canonicalNumbering_OB(obmol(sdf))
	names(results) = cid(sdf)
	results
}
canonicalNumbering <- canonicalNumberingOB

applyOptions <- function(sdf,options){
	.ensureOB()

	if(inherits(sdf,"SDFset") || inherits(sdf,"SDF")){
		defs = sdfSet2definition(sdf)
		sdfNew = definition2SDFset(convertFormat("SDF","SDF",defs,options))
		cid(sdfNew)=sdfid(sdf)
		if(inherits(sdf,"SDF"))
			sdfNew[[1]]
		else
			sdfNew
	}else
		stop("input to applyOptions must be a SDFset or SDF object. Found ",class(sdf))
}
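## Example (illustrative sketch): the wrappers above each pass a single Open
## Babel generation option; arbitrary options can be supplied the same way.
##   applyOptions(sdfsample[1], data.frame(names="gen3D", args=""))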

# timing scaffolding for the commented-out instrumentation in genAPDescriptors()
times = new.env()
times$descT = 0
times$uniqueT = 0
genAPDescriptors <- function(sdf,uniquePairs=TRUE){

	# t=Sys.time()
	d=.Call("genAPDescriptor",sdf)
	# times$descT <- times$descT + (Sys.time() - t)

	if(uniquePairs){
		# t=Sys.time()
		d= .uniquifyAtomPairs(d)
		# times$uniqueT <- times$uniqueT + (Sys.time() - t)
	}

	d
}

propOB <- function(sdfSet){
	.ensureOB()
	results = prop_OB(obmol(sdfSet))
	rownames(results) = cid(sdfSet)
	results
}

fingerprintOB <- function(sdfSet,fingerprintName){
	.ensureOB()
   x = fingerprint_OB(obmol(sdfSet),fingerprintName)
	if(is.vector(x)) x= t(as.matrix(x))

	fpset = new("FPset",fpma=x,
					type=fingerprintName)
	cid(fpset) = cid(sdfSet)
	fpset
}
smartsSearchOB <- function(sdfset,smartsPattern,uniqueMatches=TRUE){
	.ensureOB()
	smartsSearch_OB(obmol(sdfset),smartsPattern,uniqueMatches)
}
exactMassOB <- function(sdfset){
	.ensureOB()
	exactMass_OB(obmol(sdfset))
}
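## Example (illustrative sketch; requires ChemmineOB; "FP2" is one of Open
## Babel's built-in fingerprint names, and the SMARTS pattern below matches
## carbonyl groups):
##   propOB(sdfsample[1:5])                       # Open Babel property table
##   fingerprintOB(sdfsample[1:5], "FP2")         # FPset of Open Babel fingerprints
##   smartsSearchOB(sdfsample[1:5], "[#6]=[#8]")  # match counts per compound
##   exactMassOB(sdfsample[1:5])                  # monoisotopic masses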


# `compounds' should be a list or APset of items that can be passed to the
# `similarity' function; n of them are selected greedily with a farthest-point
# (max-min distance) heuristic.
maximallyDissimilar <- function(compounds,n,similarity = cmp.similarity) {
	dist = function(a,b) 1-similarity(a,b)
	if(length(compounds) <= 2)
		stop("maximallyDissimilar: compounds must have more than 2 elements")

	#pick first selection at random
	selected = sample(1:length(compounds),1)

	#compute dist between compounds[-selected] and compounds[selected]
	minDistsToSelected = sapply(1:length(compounds),
										 function(i) dist(compounds[[i]],compounds[[selected]]))

	for(k in 2:n){
		#find candidate farthest from those selected
		newPoint = which.max(minDistsToSelected)
		selected = c(selected,newPoint)
		minDistsToSelected[newPoint] = 0

		#update min distances
		minDistsToSelected = sapply(1:length(compounds),
											 function(i) 
												 if(minDistsToSelected[i]==0) 0 
												 else min(minDistsToSelected[i],dist(compounds[[i]],compounds[[newPoint]])))
	}
	
	compounds[selected]
}
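## Example (illustrative sketch): greedy farthest-point selection of 3 compounds
## from an atom-pair descriptor set (the first pick is random).
##   apset <- sdf2ap(sdfsample[1:20])
##   picks <- maximallyDissimilar(apset, n=3)   # APset of 3 compounds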

sdf2OBMol<- function(sdfSet){
	.ensureOB()
	defs = paste(Map(function(x) paste(x,collapse="\n"),
						  as(as(sdfSet,"SDFstr"),"list")),"\n",
					 sep="",collapse="" )

	forEachMol("SDF",defs,identity,c)
}

largestComponent <- function(sdfSet){
	smiles = as.character(sdf2smiles(sdfSet))
	splits = strsplit(smiles,'.',fixed=TRUE)
	largestComps = sapply(splits,
			 function(comps){  
				 comps[sort.int(vapply(comps,function(x){nchar(x)},0),
									 decreasing=TRUE,
									 index.return=TRUE)$ix][1] 
			 })
	#print(largestComps)
	compsSdf = smiles2sdf(largestComps)
	cid(compsSdf) = cid(sdfSet)
	compsSdf
}
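## Example (illustrative sketch; requires ChemmineOB for the SMILES round trip):
##   mixture <- smiles2sdf(c(cmp1="CCC.CC.C"))   # propane, ethane and methane in one record
##   largestComponent(mixture)                   # keeps only the "CCC" component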