################################################
## Class and Method Definitions for ChemmineR ##
################################################
## SDF format definition:
# http://www.epa.gov/ncct/dsstox/MoreonSDF.html
# http://www.symyx.com/downloads/public/ctfile/ctfile.jsp
#################################################
## (1) Class and Method Definitions for SDFstr ##
#################################################
## Download PubChem SDF samples for code testing
## This function is not intended to be used by users.
.sdfDownload <- function(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz") {
system(paste("wget ", mypath, myfile, sep=""))
system(paste("gunzip ", myfile))
}
# .sdfDownload(mypath="ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/", myfile="Compound_00650001_00675000.sdf.gz")
## Import SD File and Return SDFstr Class (list-like)
read.SDFstr <- function(sdfstr) {
if(length(sdfstr) > 1) { # Support for passing on SD File content as character vector
mysdf <- sdfstr
} else {
mysdf <- readLines(sdfstr) # Reads file line-wise into vector
}
y <- regexpr("^\\${4,4}", mysdf, perl=TRUE) # identifies all fields that start with a '$$$$' sign
index <- which(y!=-1)
#if the last non-empty line of mysdf is not "$$$$". Only search from the last "$$$$" found
# to the end of the file
start = if(length(index)==0) 1 else index[length(index)]
if("$$$$" != Find(function(line) line !="", mysdf[start:length(mysdf)]
,right=TRUE)){
# we have a MOL file, so just insert the $$$$ at the end
mysdf <- c(mysdf,"$$$$")
index <- c(index,length(mysdf))
}
indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
mysdf_list <- lapply(seq(along=indexDF[,1]), function(x) mysdf[seq(indexDF[x,1], indexDF[x,2])])
if(class(mysdf_list) != "list") { mysdf_list <- list(as.vector(mysdf_list)) }
names(mysdf_list) <- 1:length(mysdf_list)
mysdf_list <- new("SDFstr", a=mysdf_list)
return(mysdf_list)
}
## Define SDFstr class
setClass("SDFstr", representation(a = "list"))
## Accessor method for SDFstr
setGeneric(name="sdfstr2list", def=function(x) standardGeneric("sdfstr2list"))
setMethod(f="sdfstr2list", signature="SDFstr", definition=function(x) {return(x@a)})
## Replacement method for SDFstr using accessor method
setGeneric(name="sdfstr2list<-", def=function(x, value) standardGeneric("sdfstr2list<-"))
setReplaceMethod(f="sdfstr2list", signature="SDFstr", definition=function(x, value) {
x@a <- value
return(x)
})
## Replacement method for SDFstr using "[" operator
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, j, value) {
x@a[i] <- sdfstr2list(value)
return(x)
})
## Define behavior of "[" operator for SDFstr
setMethod(f="[", signature="SDFstr", definition=function(x, i, ..., drop) {
x@a <- x@a[i]
return(x)
})
## Behavior of "[[" operator to convert single SDFstr component to character vector
setMethod(f="[[", signature="SDFstr",
definition=function(x, i, ..., drop) {
return(x@a[[i]])
})
## Replacement method for SDFstr using "[" operator
setReplaceMethod(f="[", signature="SDFstr", definition=function(x, i, value) {
x@a[i] <- value
return(x)
})
## Replacement method for SDFstr using "[[" operator
setReplaceMethod(f="[[", signature="SDFstr", definition=function(x, i, value) {
x@a[[i]] <- value
return(x)
})
## Define print behavior for SDFstr
setMethod(f="show", signature="SDFstr",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", " with ", length(sdfstr2list(object)), " molecules", "\n", sep="")
})
## Length function
setMethod(f="length", signature="SDFstr",
definition=function(x) {
return(length(sdfstr2list(x)))
})
## Write SDF/SDFstr/SDFset Objects to SD File
write.SDF <- function(sdf, file, cid=FALSE, ...) {
if(class(sdf)=="SDF") sdfstr <- as(sdf, "SDFstr")
if(class(sdf)=="SDFstr") sdfstr <- sdf
if(class(sdf)=="SDFset") {
if(cid==TRUE) { sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], cid=x, ...)) }
if(cid==FALSE) { sdflist <- lapply(cid(sdf), function(x) sdf2str(sdf=sdf[[x]], ...)) }
sdfstr <- as(sdflist, "SDFstr")
}
cat(unlist(sdfstr2list(sdfstr)), sep="\n", file=file)
}
## Coerce Methods for SDFstr Class
## SDFstr to list
setAs(from="SDFstr", to="list",
def=function(from) {
sdfstr2list(from)
})
## List to SDFstr
setAs(from="list", to="SDFstr",
def=function(from) {
new("SDFstr", a=from)
})
## List to SDFstr
setAs(from="character", to="SDFstr",
def=function(from) {
new("SDFstr", a=list(from))
})
################################################
## (2) Class and Method Definitions for SDF ##
################################################
setClass("ExternalReference")
setClassUnion("ExternalReferenceOrNULL",members=c("ExternalReference","NULL"))
setClass("SDF", representation(header="character", atomblock="matrix",
bondblock="matrix", datablock="character",
obmolRef="ExternalReferenceOrNULL",
version="character"),
prototype=list(obmolRef=NULL,version="V2000"))
## Convert SDFstr to SDF Class
## SDFstr Parser Function
.sdfParse <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE) {
countpos <- grep("V\\d\\d\\d\\d$", sdf, perl=TRUE)
if(length(countpos)==0)
countpos <- grep("V {0,}\\d\\d\\d\\d$", sdf, perl=TRUE)
if(length(countpos)==0 || length(countpos)>1)
countpos <- 4
countline <- sdf[countpos]
if(length(grep("V3000$",countline))!=0) # we have a V3000 formatted file
return(.parseV3000(sdf,datablock,tail2vec,extendedAttributes))
if(nchar(gsub("\\d| ", "", substring(countline, 1, 6))) != 0)
countline <- " 0 0" # Create dummy countline if it contains non-numeric values
Natom <- as.numeric(substring(countline, 1, 3))
Nbond <- as.numeric(substring(countline, 4, 6))
start <- c(header=1, atom=countpos+1, bond=countpos+Natom+1, extradata=countpos+Natom+Nbond+1)
index <- cbind(start=start, end=c(start[2:3], start[4], length(sdf)+1)-1)
## Header block
header <- sdf[index["header",1]:index["header",2]]
if(length(header)==4) names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")
## Atom block
## format: x y z <atom symbol>
ab2matrix <- function(ct=sdf[index["atom",1]:index["atom",2]]) {
if((index["atom","end"] - index["atom","start"]) < 1) {
ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
} else {
ct <- gsub("^ {1,}", "", ct)
ctlist <- strsplit(ct, " {1,}")
ctma <- tryCatch(matrix(unlist(ctlist),
ncol=length(ctlist[[1]]),
nrow=length(ct),
byrow=TRUE),
warning=function(w) NULL)
# If rows in atom block are of variable length, use alternative/slower approach
if(length(ctma)==0) {
maxcol <- max(sapply(ctlist, length))
ctma <- matrix(0, nrow=length(ct), ncol=maxcol)
for(i in seq(along=ctma[,1]))
ctma[i, 1:length(ctlist[[i]])] <- ctlist[[i]]
}
myrownames <- paste(ctma[,4], 1:length(ctma[,4]), sep="_")
# Ncol <- length(ctlist[[1]])
Ncol <- length(ctma[1, , drop = FALSE])
ctma <- matrix(as.numeric(ctma[,-4]),
nrow=length(ct),
ncol=Ncol-1,
dimnames=list(myrownames, paste("C", c(1:3, 5:Ncol), sep="")))
}
return(ctma)
}
atomblock <- ab2matrix(ct=sdf[index["atom",1]:index["atom",2]])
## Bond block
bb2matrix <- function(ct=sdf[index["bond",1]:index["bond",2]]) {
#if((index["bond","end"] - index["bond","start"]) < 1) {
if(((index["bond","end"] - index["bond","start"])+1) < 1) {
ctma <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
} else {
ct <- gsub("^(...)(...)(...)(...)(...)(...)(...)", "\\1 \\2 \\3 \\4 \\5 \\6 \\7", ct)
ct <- gsub("(^..\\d)(\\d)", "\\1 \\2", ct) # Splits bond strings where one or both of the atoms have 3 digit numbers
ct <- gsub("^ {1,}", "", ct)
ctlist <- strsplit(ct, " {1,}")
ctma <- matrix(unlist(ctlist), ncol=length(ctlist[[1]]), nrow=length(ct), byrow=TRUE)
Ncol <- length(ctlist[[1]])
ctma <- matrix(as.numeric(ctma), nrow=length(ct), ncol=Ncol, dimnames=list(1:length(ct), paste("C", 1:Ncol, sep="")))
}
return(ctma)
}
bondblock <- bb2matrix(ct=sdf[index["bond",1]:index["bond",2]])
if(tail2vec==TRUE) {
extradata <- ex2vec(extradata=sdf[index["extradata",1]:index["extradata",2]])
} else {
extradata <- sdf[index["extradata",1]:index["extradata",2]]
}
## Assemble components in object of class SDF
if(datablock==TRUE) {
sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock, datablock=extradata)
} else {
sdf <- new("SDF", header=header, atomblock=atomblock, bondblock=bondblock)
}
return(sdf)
}
## SDF name/value block
ex2vec <- function(extradata) {
exstart <- grep("^>", extradata)
if(length(exstart)==0) {
exvec <- vector("character", length=0)
} else {
names(exstart) <- gsub("^>.*<|>", "", extradata[exstart])
exindex <- cbind(start=exstart, end=c(exstart[-1], length(extradata))-1) # Changed 'length(extradata)+1' to 'length(extradata)' on Mar 22, 2014
exvec <- sapply(rownames(exindex), function(x) paste(extradata[(exindex[x,1]+1):(exindex[x,2]-1)], collapse=" __ "))
}
return(exvec)
}
findPositions = function(sdf){
patterns = c("BEGIN CTAB","COUNTS","BEGIN ATOM","END ATOM",
"BEGIN BOND", "END BOND","M END")
tagPositions = lapply(patterns,function(pattern) grep(pattern,sdf,fixed=TRUE))
tagPositions[which(lapply(tagPositions,length)==0)] = -1 # ensure unlist does not remove undefined entries
tagPositions = unlist(tagPositions)
unfound = which(tagPositions == -1)
if(length(unfound) != 0){ # some tags are missing
warning("malformed SDF V3000 file, could not find tags ",
paste("\"",patterns[unfound],"\"",sep="",collapse=","))
}
tagPositions
}
.parseV3000 <- function(sdf, datablock=TRUE, tail2vec=TRUE,extendedAttributes=FALSE) {
#message("found V3000 formatted compound")
sdfLength = length(sdf)
tagPositions=findPositions(sdf)
ctabPos = tagPositions[1]
if(ctabPos == -1){
header = sdf[1:4]
}else
header = sdf[1:(ctabPos-1)]
if(length(header)==4)
names(header) <- c("Molecule_Name", "Source", "Comment", "Counts_Line")
countLinePos = tagPositions[2]
if(countLinePos != -1)
header["Counts_Line"] = sdf[countLinePos]
#message("parsing atomblock")
atomPos = tagPositions[3]
if(atomPos == -1){
warning("No ATOM block found in ",sdf[1]," returning a dummy atom block")
atomblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
}else{
#message("atomPos: ",atomPos," class: ",class(atomPos))
atomEndPos = tagPositions[4]
if(atomEndPos == -1){
warning("Could not find the end of the ATOM block in ",sdf[1])
atomEndPos = atomPos + 1 #assume and emtpy atom block
}
data = Reduce(rbind,Map(function(line) {
parts = cstrsplit(line)
attrs = if(length(parts) > 8)
paste(parts[9:length(parts)],collapse=" ")
else ""
c(parts[3:7],attrs)
}, sdf[(atomPos+1):(atomEndPos-1)])) #TODO: check for empty range
#if(extendedAttributes)
# extAtomAttrs = parseAttributes(data[,6])
extAtomAttrs = parseAttributes(data[,6])
# we need to tranlate certain "Standard" attribute values from v3k back
# into the v2k format to make things consistent
#
# <mass difference> MASS v2k: offset from pt v3k: actual value
# <charge> CHG v2k: value of 1-7 v3k: actual charge value
# <atom stereo parity> CFG
# <hydrogen count +1> HCOUNT
# <stereo care box> STBOX
# <valence> VAL v2k: 15 indicates 0 v3: -1 indicates 0
standardAttrs = matrix(0,nrow(data),7)
for(i in seq(along=extAtomAttrs)){ # for each atom
mass = extAtomAttrs[[i]]$MASS
if(!is.null(mass)){
massDiff = mass - AW[data[i,2]]
standardAttrs[i,1] = massDiff
}
chg= extAtomAttrs[[i]]$CHG
if(!is.null(chg)){
chg= 4 - as.numeric(chg)
if(chg == 4) # undo shift for 0 values
chg = 0
if(chg < 1 || chg > 7)
chg = 0 # chg not in [1,7] => 0
standardAttrs[i,2] = chg
}
cfg = extAtomAttrs[[i]]$CFG
if(!is.null(cfg)){
standardAttrs[i,3] = cfg
}
hcount = extAtomAttrs[[i]]$HCOUNT
if(!is.null(hcount)){
standardAttrs[i,4] = hcount
}
stbox = extAtomAttrs[[i]]$STBOX
if(!is.null(stbox)){
standardAttrs[i,5] = stbox
}
val = extAtomAttrs[[i]]$val
if(!is.null(val)){
if(val == -1) # means 0 in v3k
val = 15 # translate to 0 in v2k
standardAttrs[i,6] = val
}
}
#print(extAtomAttrs)
atomblock =cbind(data[,3:5],standardAttrs)
mode(atomblock)="numeric"
#print(atomblock)
colnames(atomblock) = paste("C",c(1:3,5:11),sep="")
rownames(atomblock) = paste(data[,2],data[,1],sep="_")
}
#message("parsing bond block")
bondPos = tagPositions[5]
if(bondPos == -1){
warning("No BOND block found in ",sdf[1]," returning a dummy bond block")
bondblock <- matrix(rep(0,2), 1, 2, dimnames=list("0", c("C1", "C2"))) # Creates dummy matrix in case there is none.
}else{
bondEndPos = tagPositions[6]
if(bondEndPos == -1){
warning("Could not find the end of the BOND block in ",sdf[1])
bondEndPos = bondPos + 1 #assume and emtpy atom block
}
data = Reduce(rbind,Map(function(line) {
#cstrsplit(line)[3:6]
parts = cstrsplit(line)
attrs= if(length(parts) > 6)
paste(parts[7:length(parts)],collapse=" ")
else ""
c(parts[3:6],attrs)
}, sdf[(bondPos+1):(bondEndPos-1)])) #TODO: check for empty range
extBondAttrs = parseAttributes(data[,5]) # +2.6s
# CFG bond configuration(stereo) change: 0 -> 0, 1-> 1, 2-> 4, 3->6
# empty slot
# TOPO same
# RXCTR same
standardAttrs = matrix(0,nrow(data),4) # +0.8s
for(i in seq(along=extBondAttrs)){ # +1.3s
cfg=extBondAttrs[[i]]$CFG
if(!is.null(cfg)){
#cfg = extBondAttrs[[i]]$CFG
if(cfg == 2) cfg = 4
else if(cfg == 3) cfg = 6
standardAttrs[i,1] = cfg
}
topo=extBondAttrs[[i]]$TOPO
if(!is.null(topo)){
standardAttrs[i,3] = topo
}
rxctr=extBondAttrs[[i]]$rxctr
if(!is.null(rxctr)){
standardAttrs[i,4] = rxctr
}
}
bondblock = cbind(data[,c(3,4,2)],standardAttrs) # +0.5s
#bondblock = data[,c(3,4,2)]
mode(bondblock)="numeric"
colnames(bondblock) = paste("C",1:7,sep="")
rownames(bondblock) = data[,1]
}
#message("parsing extra data")
endPos = tagPositions[7]
if(endPos == -1){
warning("no END tag found in ",sdf[1])
extradata = vector("character",length=0)
}else{
if(endPos+2 < sdfLength){
extradata = if(tail2vec==TRUE) ex2vec(sdf[(endPos+1):sdfLength])
else sdf[(endPos+1):sdfLength]
}
else
extradata = vector("character",length=0)
}
## Assemble components in object of class SDF
className = "SDF"
args = list( header=header, atomblock=atomblock, bondblock=bondblock,version="V3000")
if(datablock==TRUE)
args = c(args,datablock=list(extradata))
if(extendedAttributes==TRUE) {
className = "ExtSDF"
args = c(args,extendedAtomAttributes =list(extAtomAttrs),
extendedBondAttributes = list(extBondAttrs) )
}
#message("className: ",className," args: ")
#print(str(args))
sdf = do.call("new",c(className,args))
return(sdf)
}
trim <- function(str) gsub("^\\s+|\\s+$","",str)
parseAttributes <- function(attributes){
#message("attributes: ")
#print(attributes)
if(all(attributes=="")){
return(sapply(seq(along=attributes),function(i) list()))
}
starts = gregexpr("(^|\\s)[a-zA-Z0-9]+=",attributes)
sapply(seq(along=attributes),function(i) {
if(attributes[i]=="")
list()
else{
starts[[i]][length(starts[[i]])+1] = nchar(attributes[i])
# message("starts[[",i,"]]:")
# print(starts[[i]])
sapply(1:(length(starts[[i]])-1),function(j){
pair = unlist(strsplit(
substring(attributes[i],starts[[i]][j],starts[[i]][j+1])
,"=",fixed=TRUE) )
# print(pair)
if(length(pair) != 2)
warning("bad key value pair found:
",substring(attributes[i],starts[[i]][j],starts[[i]][j+1]))
l=list()
l[[trim(pair[1])]] = trim(pair[2])
l
})
}
})
}
attributesAsString<- function(attributes){
sapply(attributes,
function(x){
return(
if(length(x) > 0)
paste(apply(cbind(names(x),E="=",x),1,function(row) paste(row,collapse="") ),collapse=" ")
else "") })
}
## Accessor methods for SDF class
setGeneric(name="sdf2list", def=function(x) standardGeneric("sdf2list"))
setMethod(f="sdf2list", signature="SDF", definition=function(x) {return(list(header=header(x), atomblock=atomblock(x), bondblock=bondblock(x), datablock=datablock(x)))})
setGeneric(name="header", def=function(x) standardGeneric("header"))
setMethod(f="header", signature="SDF", definition=function(x) {return(x@header)})
setGeneric(name="sdfid", def=function(x, tag=1) standardGeneric("sdfid"))
setMethod(f="sdfid", signature="SDF", definition=function(x, tag=1) {return(x@header[tag])})
setGeneric(name="sdfid<-", def=function(x, value) standardGeneric("sdfid<-"))
setReplaceMethod(f="sdfid", signature="SDF", definition=function(x, value) {
x@header[1]=value
return(x)
})
setGeneric(name="atomblock", def=function(x) standardGeneric("atomblock"))
setMethod(f="atomblock", signature="SDF", definition=function(x) {return(x@atomblock)})
setGeneric(name="atomcount", def=function(x, addH=FALSE, ...) standardGeneric("atomcount"))
setMethod(f="atomcount", signature="SDF", definition=function(x, addH=FALSE, ...) {
if(addH==TRUE) {
return(table(c(gsub("_.*", "", rownames(x@atomblock)), rep("H", bonds(x, type="addNH")))))
} else {
return(table(gsub("_.*", "", rownames(x@atomblock))))
}
})
setGeneric(name="bondblock", def=function(x) standardGeneric("bondblock"))
setMethod(f="bondblock", signature="SDF", definition=function(x) {return(x@bondblock)})
setGeneric(name="datablock", def=function(x) standardGeneric("datablock"))
setMethod(f="datablock", signature="SDF", definition=function(x) {return(x@datablock)})
setGeneric(name="datablocktag", def=function(x, tag) standardGeneric("datablocktag"))
setMethod(f="datablocktag", signature="SDF", definition=function(x, tag) {return(x@datablock[tag])})
setGeneric(name="obmol",def=function(x) standardGeneric("obmol"))
setMethod(f="obmol",signature="SDF",definition= function(x) {
.ensureOB()
if(is.null(x@obmolRef))
x@obmolRef = sdf2OBMol(x)
x@obmolRef
})
## Define print behavior for SDF
setMethod(f="show", signature="SDF",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
cat("\n<<header>>", "\n", sep="")
print(header(object))
cat("\n<<atomblock>>", "\n", sep="")
if(length(atomblock(object)[,1])>=5) {
print(as.data.frame(rbind(atomblock(object)[1:2,], ...=rep("...", length(atomblock(object)[1,])),
atomblock(object)[(length(atomblock(object)[,1])-1):length(atomblock(object)[,1]),])))
} else {
print(atomblock(object))}
cat("\n<<bondblock>>", "\n", sep="")
if(length(bondblock(object)[,1])>=5) {
print(as.data.frame(rbind(bondblock(object)[1:2,], ...=rep("...", length(bondblock(object)[1,])),
bondblock(object)[(length(bondblock(object)[,1])-1):length(bondblock(object)[,1]),])))
} else {
print(bondblock(object))}
cat("\n<<datablock>> (", length(datablock(object)), " data items)", "\n", sep="")
if(length(datablock(object))>=5) {
print(c(datablock(object)[1:4], "..."))
} else {
print(datablock(object))}
})
## Behavior of "[" operator for SDF
setMethod(f="[", signature="SDF",
definition=function(x, i, ..., drop) {
return(sdf2list(x)[i])
})
## Replacement method for SDF using "[" operator
setReplaceMethod(f="[", signature="SDF", definition=function(x, i, j, value) {
if(i==1) x@header <- value
if(i==2) x@atomblock <- value
if(i==3) x@bondblock <- value
if(i==4) x@datablock <- value
if(i=="header") x@header <- value
if(i=="atomblock") x@atomblock <- value
if(i=="bondblock") x@bondblock <- value
if(i=="datablock") x@datablock <- value
obmolRef=NULL
return(x)
})
## Behavior of "[[" operator for SDF
setMethod(f="[[", signature="SDF",
definition=function(x, i, ..., drop) {
return(sdf2list(x)[[i]])
})
## Replacement method for SDF using "[[" operator
setReplaceMethod(f="[[", signature="SDF", definition=function(x, i, j, value) {
if(i==1) x@header <- value
if(i==2) x@atomblock <- value
if(i==3) x@bondblock <- value
if(i==4) x@datablock <- value
if(i=="header") x@header <- value
if(i=="atomblock") x@atomblock <- value
if(i=="bondblock") x@bondblock <- value
if(i=="datablock") x@datablock <- value
obmolRef=NULL
return(x)
})
## Coerce Methods for SDF Class
## Character vector to SDF
setAs(from="character", to="SDF",
def=function(from) {
.sdfParse(sdf=from)
})
## SDF to list
setAs(from="SDF", to="list",
def=function(from) {
list(header=header(from), atomblock=atomblock(from), bondblock=bondblock(from), datablock=datablock(from))
})
## SDF to SDFstr
setAs(from="SDF", to="SDFstr",
def=function(from) {
new("SDFstr", a=list(as(from, "character")))
})
## list (w. SDF components) to SDF
setAs(from="list", to="SDF",
def=function(from) {
new("SDF", bondblock=from$bondblock, header=from$header, atomblock=from$atomblock, datablock=from$datablock)
})
#################################################
## ExtSDF
#################################################
setClass("ExtSDF",representation(extendedAtomAttributes="list",
extendedBondAttributes="list"),contains="SDF")
setGeneric(name="getAtomAttr", def=function(x,atomId,tag) standardGeneric("getAtomAttr"))
setMethod(f="getAtomAttr", signature="ExtSDF",definition =
function(x,atomId,tag) x@extendedAtomAttributes[[atomId]][[tag]])
setGeneric(name="getBondAttr", def=function(x,bondId,tag) standardGeneric("getBondAttr"))
setMethod(f="getBondAttr", signature="ExtSDF",definition =
function(x,bondId,tag) x@extendedBondAttributes[[bondId]][[tag]])
setMethod(f="show", signature="ExtSDF",
definition=function(object) {
callNextMethod(object)
nonEmptyAtoms = which(sapply(object@extendedAtomAttributes,
function(x) length(x)!=0))
message("<<Extended Atom Attributes>>")
count = min(5,length(nonEmptyAtoms))
for(i in nonEmptyAtoms[seq(1,count,length.out=count)]){
message("Atom ",i)
print(object@extendedAtomAttributes[[i]])
}
nonEmptyBonds = which(sapply(object@extendedBondAttributes,
function(x) length(x)!=0))
message("<<Extended Bond Attributes>>")
count = min(5,length(nonEmptyBonds))
for(i in nonEmptyBonds[seq(1,count,length.out=count)]){
message("Bond ",i)
print(object@extendedBondAttributes[[i]])
}
})
#################################################
## (3) Class and Method Definitions for SDFset ##
#################################################
setClass("SDFset", representation(SDF="list", ID="character"))
## Store many SDFs in SDFset Object
read.SDFset <- function(sdfstr=sdfstr,skipErrors=FALSE, ...) {
## If a file name is provided run read.SDFstr function first
if(is.character(sdfstr)) {
sdfstr <- read.SDFstr(sdfstr=sdfstr)
}
## Iterate over SDFstr components
sdfset <- lapply(seq(along=sdfstr@a),
function(x){
tryCatch(
.sdfParse(sdfstr2list(sdfstr)[[x]], ...),
error=function(error){
if(skipErrors){
warning("failed to parse item ",x,": ",error)
NA
}else
stop("failed to parse item ",x,": ",error)
}
)
})
failedToParse <- is.na(sdfset)
if(sum(failedToParse) != 0)
warning("Failed to parse input compounds at indexe(s) (",paste(which(failedToParse),collapse=", "),
"). These have been removed from the output.")
sdfset <- sdfset[!failedToParse]
sdfset <- new("SDFset", SDF=sdfset, ID=paste("CMP", seq(along=sdfset), sep=""))
## Validity check of SDFs based on atom/bond block column numbers
badsdf <- sum(!validSDF(sdfset))
if(sum(badsdf)!=0) warning(paste(c(sum(badsdf), " invalid SDFs detected. To fix, run: valid <- validSDF(sdfset); sdfset <- sdfset[valid]")))
return(sdfset)
}
## Accessor methods for SDFset class
setGeneric(name="SDFset2list", def=function(x) standardGeneric("SDFset2list"))
setMethod(f="SDFset2list", signature="SDFset", definition=function(x) {
SDFlist <- x@SDF
charlist <- lapply(seq(along=SDFlist), function(x) as(SDFlist[[x]], "list"))
names(charlist) <- x@ID
return(charlist)
})
setGeneric(name="SDFset2SDF", def=function(x) standardGeneric("SDFset2SDF"))
setMethod(f="SDFset2SDF", signature="SDFset", definition=function(x) { tmp <- x@SDF; names(tmp) <- x@ID; return(tmp)})
setMethod(f="header", signature="SDFset", definition=function(x) { return(lapply(SDFset2SDF(x), header))})
setMethod(f="sdfid", signature="SDFset", definition=function(x, tag=1) {return(as.vector(sapply(SDFset2SDF(x), sdfid, tag)))})
setGeneric(name="cid", def=function(x) standardGeneric("cid"))
setMethod(f="cid", signature="SDFset", definition=function(x) {return(x@ID)})
setMethod(f="atomblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), atomblock))})
setMethod(f="atomcount", signature="SDFset", definition=function(x, addH, ...) {
atomcounts <- lapply(SDFset2SDF(x), function(y) atomcount(y, addH, ...))
return(atomcounts)
})
setMethod(f="bondblock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), bondblock))})
setMethod(f="datablock", signature="SDFset", definition=function(x) {return(lapply(SDFset2SDF(x), datablock))})
setMethod(f="datablocktag", signature="SDFset", definition=function(x, tag) {return(as.vector(sapply(SDFset2SDF(x), datablocktag, tag)))})
setMethod(f="obmol",signature="SDFset",definition= function(x) lapply(SDFset2SDF(x),obmol))
## Replacement method for SDF component of SDFset using accessor methods
setGeneric(name="SDFset2SDF<-", def=function(x, value) standardGeneric("SDFset2SDF<-"))
setReplaceMethod(f="SDFset2SDF", signature="SDFset", definition=function(x, value) {
x@SDF <- value
return(x)
})
## Replacement method for ID component of SDFset using accessor methods
setGeneric(name="cid<-", def=function(x, value) standardGeneric("cid<-"))
setReplaceMethod(f="cid", signature="SDFset", definition=function(x, value) {
x@ID <- value
if(any(duplicated(x@ID))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
}
return(x)
})
## Replacement method for molecule name of each SDF in an SDFset using accessor methods
setGeneric(name="sdfid<-", def=function(x, value) standardGeneric("sdfid<-"))
setReplaceMethod(f="sdfid", signature="SDFset", definition=function(x, value) {
if(length(x@SDF) != length(value)){
stop("length of assigned value does not match length of assignee")
}
for(i in seq_along(x@SDF)){
sdfid(x[[i]]) = value[i]
}
return(x)
})
## Replacement method for SDFset using "[" operator
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, j, value) {
x@SDF[i] <- SDFset2SDF(value)
return(x)
})
## Behavior of "[" operator for SDFset
setMethod(f="[", signature="SDFset", definition=function(x, i, ..., drop) {
if(is.logical(i)) {
i <- which(i)
}
if(is.character(i)) {
ids <-seq(along=x@ID); names(ids) <- x@ID
i <- ids[i]
}
x@SDF <- x@SDF[i]
x@ID <- x@ID[i]
if(any(duplicated(i))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
}
return(x)
})
## Replacement method for SDFset using "[" operator
setReplaceMethod(f="[", signature="SDFset", definition=function(x, i, value) {
x@SDF[i] <- value
return(x)
})
## Replacement method for SDFset using "[[" operator
setReplaceMethod(f="[[", signature="SDFset", definition=function(x, i, value) {
x@SDF[[i]] <- value
return(x)
})
## Behavior of "[[" operator for SDFset to convert single SDFset component to SDF
setMethod(f="[[", signature="SDFset", definition=function(x, i, ..., drop) {
if(is.character(i)) { i <- which(x@ID %in% i) }
return(x@SDF[[i]])
})
## Batch replacement of header, atomblock, bondblock and datablock sections for
## one to all SDF objects in an SDFset.
.gsubsdfsec <- function(sdfset, what, secdata) {
if(any(class(secdata) %in% c("data.frame", "matrix"))) {
secdata <- as.matrix(secdata)
tmp <- as.list(seq(along=secdata[,1]))
for(i in seq(along=tmp)) {
vec <- as.character(secdata[i,])
names(vec) <- colnames(secdata)
tmp[[i]] <- vec
}
secdata <- tmp
}
if(length(sdfset) != length(secdata)) { stop("The length of the two data sets needs to match.") }
sdfsec <- list(header=header(sdfset), atomblock=atomblock(sdfset), bondblock=bondblock(sdfset), datablock=datablock(sdfset))
sdfsec[[what]] <- secdata
sdflist <- lapply(seq(along=sdfsec[[1]]), function(x) as(list(header=sdfsec$header[[x]], atomblock=sdfsec$atomblock[[x]], bondblock=sdfsec$bondblock[[x]], datablock=sdfsec$datablock[[x]]), "SDF"))
names(sdflist) <- cid(sdfset)
return(as(sdflist, "SDFset"))
}
setGeneric(name="header<-", def=function(x, value) standardGeneric("header<-"))
setReplaceMethod(f="header", signature="SDFset", definition=function(x, value) {
sdfset <- .gsubsdfsec(sdfset=x, what=1, secdata=value)
return(sdfset)
})
setGeneric(name="atomblock<-", def=function(x, value) standardGeneric("atomblock<-"))
setReplaceMethod(f="atomblock", signature="SDFset", definition=function(x, value) {
sdfset <- .gsubsdfsec(sdfset=x, what=2, secdata=value)
return(sdfset)
})
setGeneric(name="bondblock<-", def=function(x, value) standardGeneric("bondblock<-"))
setReplaceMethod(f="bondblock", signature="SDFset", definition=function(x, value) {
sdfset <- .gsubsdfsec(sdfset=x, what=3, secdata=value)
return(sdfset)
})
setGeneric(name="datablock<-", def=function(x, value) standardGeneric("datablock<-"))
setReplaceMethod(f="datablock", signature="SDFset", definition=function(x, value) {
sdfset <- .gsubsdfsec(sdfset=x, what=4, secdata=value)
return(sdfset)
})
## Length function
setMethod(f="length", signature="SDFset",
definition=function(x) {
return(length(SDFset2SDF(x)))
})
## Print behavior for SDFset
setMethod(f="show", signature="SDFset",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
if(any(duplicated(object@ID))) {
warning("The values in the CMP ID slot are not unique. To fix this, run: cid(sdfset) <- makeUnique(cid(sdfset))")
}
})
## Concatenate function for SDFset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SDFset", definition=function(x, y) {
sdflist1 <- as(x, "SDF")
sdflist2 <- as(y, "SDF")
sdflist <- c(sdflist1, sdflist2)
sdfset <- as(sdflist, "SDFset")
if(any(duplicated(cid(sdfset)))) {
warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
}
return(sdfset)
})
## Convert SDF to SDFstr Class for Export to File
## Function allows to customize output via optional arguments
setGeneric(name="sdf2str", def=function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) standardGeneric("sdf2str"))
setMethod(f="sdf2str", signature="SDF", definition = function(sdf, head, ab, bb, db, cid=NULL, sig=FALSE, ...) {
## Checks
if(class(sdf)!="SDF" && class(sdf) != "ExtSDF") stop("Function expects molecule object of class SDF as input!")
if(.hasSlot(sdf,"version") && sdf@version == "V3000"){
if(extends(class(sdf),"ExtSDF"))
return(writeV3000(sdf,head,ab,bb,db,cid,sig))
else #not enough info for V3000 output, but need to at least fix count line to make compatible with V2000
sdf[[1]]["Counts_Line"] = paste0("", nrow(atomblock(sdf)), " ", nrow(bondblock(sdf)), " 0 0 0 0 0 0 0 0999 V2000")
}
## Header
if(missing(head)) {
head <- as.character(sdf[[1]])
if(sig==TRUE) head[2] <- paste("ChemmineR-", format(Sys.time(), "%m%d%y%H%M"), "XD", sep="")
if(length(cid)==1) head[1] <- cid
}
## Atom block
if(missing(ab)) {
ab <- sdf[[2]]
#ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="right"), Space="", format(ab[,-c(1:3)], width=2, justify="right")) # Changed on 26-Aug-13
ab <- cbind(Indent="", format(ab[,1:3], width=9, justify="right"), A=format(gsub("_.*", "", rownames(ab)), width=1, justify="left"), Space="", format(ab[,-c(1:3)], width=2, justify="right"))
ab <- sapply(seq(along=ab[,1]), function(x) paste(ab[x, ], collapse=" "))
}
## Bond block
if(missing(bb)) {
bb <- sdf[[3]]
bb <- cbind(Indent="", format(bb, width=3, justify="right"))
bb <- sapply(seq(along=bb[,1]), function(x) paste(bb[x, ], collapse=""))
}
## Data block
if(missing(db)) {
db <- sdf[[4]]
if(length(db)>0) {
dbnames <- paste("> <", names(db), ">", sep="")
dbvalues <- gsub(" __ ",if(.Platform$OS.type=="unix")"\n" else "\r\n", as.character(db))
db <- as.vector(rbind(dbnames, dbvalues, ""))
} else {
db <- NULL
}
}
## Assemble in character vector
sdfstrvec <- c(head, ab, bb, "M END", db, "$$$$")
return(sdfstrvec)
})
writeV3000 <- function(sdf,head,ab,bb,db,cid=NULL, sig=FALSE) {
## Header
if(missing(head)) {
head <- as.character(sdf[[1]])
if(sig==TRUE) head[2] <- paste("ChemmineR-", format(Sys.time(), "%m%d%y%H%M"), "XD", sep="")
if(length(cid)==1) head[1] <- cid
}
head[4] = "0 0 0 0 0 999 V3000"
## Atom block
if(missing(ab)) {
ab <- sdf[[2]]
ab <- cbind(Prefix="M V30",
seq(along=ab[,1]),
A=gsub("_.*", "", rownames(ab)), # molecule name
ab[,1:3], #coordinates
rep(0,length(ab[,1])),
attributesAsString(sdf@extendedAtomAttributes)
)
ab <- sapply(seq(along=ab[,1]), function(x) paste(ab[x, ], collapse=" "))
}
## Bond block
if(missing(bb)) {
bb <- sdf[[3]]
bb <- cbind(Prefix="M V30",
seq(along=bb[,1]),
bb[,c(3,1,2)],
attributesAsString(sdf@extendedBondAttributes)
)
bb <- sapply(seq(along=bb[,1]), function(x) paste(bb[x, ], collapse=" "))
}
## Data block
if(missing(db)) {
db <- sdf[[4]]
if(length(db)>0) {
dbnames <- paste("> <", names(db), ">", sep="")
dbvalues <- gsub(" __ ",if(.Platform$OS.type=="unix")"\n" else "\r\n", as.character(db))
db <- as.vector(rbind(dbnames, dbvalues, ""))
} else {
db <- NULL
}
}
miscNumbers = strsplit(sdf[[1]]["Counts_Line"],split="\\s+",)[[1]][6:8]
counts = paste("M V30 COUNTS",length(ab),length(bb),miscNumbers[1],miscNumbers[2],miscNumbers[3] )
## Assemble in character vector
sdfstrvec <- c(head,
"M V30 BEGIN CTAB",
counts,
"M V30 BEGIN ATOM",
ab,
"M V30 END ATOM",
"M V30 BEGIN BOND",
bb,
"M V30 END BOND",
"M V30 END CTAB",
"M END",
db,
"$$$$")
return(sdfstrvec)
}
## Coerce Methods for SDFset Class
## SDFset to list with lists of SDF sub-components
setAs(from="SDFset", to="list",
def=function(from) {
SDFset2list(from)
})
## SDFset to list with many SDF objects (for summary view)
setAs(from="SDFset", to="SDF",
def=function(from) {
SDFset2SDF(from)
})
setGeneric(name="view", def=function(x) standardGeneric("view"))
setMethod(f="view", signature="SDFset", definition=function(x) { as(x, "SDF") })
## SDFstr to SDFset
setAs(from="SDFstr", to="SDFset",
def=function(from) {
read.SDFset(sdfstr=from)
})
## SDF to SDFset of length one
setAs(from="SDF", to="SDFset",
def=function(from) {
new("SDFset", SDF=list(from), ID="CMP")
})
## SDF to SDFstr
setAs(from="SDF", to="SDFstr",
def=function(from) {
as(as(from, "SDFset"), "SDFstr")
})
## SDFset to character for SDFstr coercion
setAs(from="SDF", to="character",
def=function(from) {
sdf2str(sdf=from)
})
## SDFset to SDFstr
setAs(from="SDFset", to="SDFstr",
def=function(from) {
from <- lapply(seq(along=from), function(x) as(from[[x]], "character"))
new("SDFstr", a=from)
})
## List of SDFs to SDFset
setAs(from="list", to="SDFset",
def=function(from) {
new("SDFset", SDF=from, ID=names(from))
})
## User interface to SDFset() constructor
SDFset <- function(SDFlist=list(), ID=character()) {
new("SDFset", SDF=SDFlist, ID=ID)
}
setClass("SDFset", representation(SDF="list", ID="character"))
#########################################################
## (4) Class and Method Definitions for SMI and SMIset ##
#########################################################
## Define SMI and SMIset classes
setClass("SMI", representation(smiles="character"))
setClass("SMIset", representation(smilist="list"))
## Methods to return SMI/SMIset objects as character vector or list, respectively
setMethod(f="as.character", signature="SMI", definition=function(x) {return(x@smiles)})
setMethod(f="as.character", signature="SMIset", definition=function(x) {return(unlist(x@smilist))})
## Constructor methods
## Character vector to FP with: as(myvector, "SMI")
setAs(from="character", to="SMI",
def=function(from) {
new("SMI", smiles=from)
})
## List to SMIset with: as(mylist, "SMIset")
setAs(from="list", to="SMIset",
def=function(from) {
new("SMIset", smilist=from)
})
## Character to SMIset with: as(mychar, "SMIset")
setAs(from="character", to="SMIset",
def=function(from) {
new("SMIset", smilist=as.list(from))
})
## Accessor methods for SMIset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="SMIset", definition=function(x) { return(names(x@smilist)) })
## Replacement method for ID component of SMIset using accessor methods.
setReplaceMethod(f="cid", signature="SMIset", definition=function(x, value) {
names(x@smilist) <- value
if(any(duplicated(names(x@smilist)))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
}
return(x)
})
## Replacement method for SMIset using "[" operator
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="SMIset", definition=function(x, i, value) {
x@smilist[i] <- value
names(x@smilist) <- names(value)
return(x)
})
## Behavior of "[" operator for SMIset
setMethod(f="[", signature="SMIset", definition=function(x, i, ..., drop) {
if(is.logical(i)) {
i <- which(i)
}
x@smilist <- x@smilist[i]
if(any(duplicated(i))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
}
return(x)
})
## Behavior of "[[" operator for SMIset to convert single SMIset component to SMI
setMethod(f="[[", signature="SMIset", definition=function(x, i, ..., drop) {
if(is.character(i)) { i <- which(names(x@fpma) %in% i) }
return(new("SMI", smiles=x@smilist[[i]]))
})
## Length function
setMethod(f="length", signature="SMIset",
definition=function(x) {
return(length(as.character(x)))
})
## Define print behavior for SMIset
setMethod(f="show", signature="SMIset",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
if(any(duplicated(names(object@smilist)))) {
warning("The values in the CMP ID slot are not unique. To fix this, run: cid(smiset) <- makeUnique(cid(smiset))")
}
})
## Concatenate function for SMIset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="SMIset", definition=function(x, y) {
smilist1 <- as.list(x)
smilist2 <- as.list(y)
smilist <- c(smilist1, smilist2)
smiset <- as(smilist, "SMIset")
if(any(duplicated(cid(smiset)))) {
warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
}
return(smiset)
})
## Define print behavior for SMI
setMethod(f="show", signature="SMI",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
print(object@smiles)
})
## SMIset to list with many SMI objects (for summary view)
setAs(from="SMIset", to="SMI",
def=function(from) {
tmp <- lapply(seq(along=from), function(x) from[[x]])
names(tmp) <- cid(from)
return(tmp)
})
setMethod(f="view", signature="SMIset", definition=function(x) { as(x, "SMI") })
# view(smiset)
## Import SMILES Files and Store them as SMIset Objects
read.SMIset <- function(file, removespaces=TRUE, ...) {
smisettmp <- readLines(file, ...)
if(removespaces==TRUE) smisettmp <- gsub(" {1,}", "", smisettmp)
## Add compound names where they are missing
index <- !grepl("\t.{1,}$", smisettmp)
if(any(index)) smisettmp[index] <- paste(smisettmp[index], "\t", "CMP", which(index), sep="")
## Construct and return SMIset object
smiset <- gsub("\t.*$", "", smisettmp)
names(smiset) <- gsub("^.*\t", "", smisettmp)
smiset <- as(smiset, "SMIset")
return(smiset)
}
## Write SMI/SMIset Objects to File
write.SMI <- function(smi, file, cid=TRUE, ...) {
if(class(smi)=="SMI") smima <- cbind(as.character(smi), "CMP1")
if(class(smi)=="SMIset") smima <- cbind(as.character(smi), cid(smi), ...)
if(cid==TRUE) write.table(smima, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE)
if(cid==FALSE) write.table(smima[ , 1, drop = TRUE], file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names = FALSE)
}
#######################################################
## (5) Class and Method Definitions for AP and APset ##
#######################################################
## Function to coerce SDF class to old non-S4 AP CMP object
SDF2apcmp <- function(SDF) {
atoms <- gsub("_.*", "", rownames(atomblock(SDF)))
u <- as.numeric(bondblock(SDF)[,1])
n_atoms <- length(atoms)
n_bonds <- length(u)
v <- as.numeric(bondblock(SDF)[,2])
t <- as.numeric(bondblock(SDF)[,3])
cmp = list(atoms=atoms, bonds=list(u=u, v=v, t=t), n_atoms=n_atoms, n_bonds=n_bonds)
# assume we have an SDF object, not an SDFset
sdfstr=as(as(SDF,"SDFstr"),"list")[[1]]
defs = paste(sdfstr,collapse="\n")
#defs = unlist(Map(function(x) paste(x,collapse="\n"), sdfstrList) )
d <- Descriptors()
if (Descriptors_parse_sdf(self=d, sdf=defs) == 0) {
stop("SDF format not available or SDF not well-formatted!")
return(list(n_atoms=0, n_bonds=0, desc_obj=NULL))
}
cmp$desc_obj=d
return(cmp)
}
## Define AP/APset S4 classes for single AP vector and AP list
setClass("AP", representation(AP="numeric"))
setClass("APset", representation(AP="list", ID="character"))
## Create instance of APset form SDFset
sdf2ap <- function(sdfset, type="AP",uniquePairs=TRUE) {
if(!class(sdfset) %in% c("SDF", "SDFset")) stop("Functions expects input of classes SDF or SDFset.")
if(class(sdfset)=="SDF") {
if(type=="AP") {
return(new("AP",
AP=genAPDescriptors(sdfset,uniquePairs)))
}
if(type=="character") {
return(paste(genAPDescriptors(sdfset,uniquePairs), collapse=", "))
}
}
if(class(sdfset)=="SDFset") {
aplist <- as.list(seq(along=sdfset))
exception <- FALSE
for(i in seq(along=aplist)) {
tmp <- try(genAPDescriptors(sdfset[[i]],uniquePairs))
if(length(tmp) > 0 & class(tmp)!="try-error") {
aplist[[i]] <- tmp
} else if(length(tmp) == 0 & class(tmp)!="try-error") {
aplist[[i]] <- 0 # Value to use if no atom pairs are returned
exception <- TRUE
} else if(class(tmp)=="try-error") {
aplist[[i]] <- 1 # Value to use if error is returned
exception <- TRUE
}
}
if(exception) {
warning("One or more compounds failed to return APs. To identify them, run: \n\t which(sapply(as(apset, \"list\"), length)==1)")
}
if(type=="AP") {
return(new("APset", AP=aplist, ID=cid(sdfset)))
}
if(type=="character") {
names(aplist) <- cid(sdfset)
return(sapply(aplist, paste, collapse=", "))
}
}
}
## Create AP Fingerprints
desc2fp <- function(x, descnames=1024, type="FPset") {
if(length(descnames) == 1) {
data(apfp)
descnames <- as.character(apfp$AP)[1:descnames]
}
if(class(x)=="APset") {
apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(cid(x), descnames))
apsetlist <- ap(x)
for(i in cid(x)) apfp[i, descnames %in% as.character(apsetlist[[i]])] <- 1
} else if(class(x)=="list") {
apfp <- matrix(0, nrow=length(x), ncol=length(descnames), dimnames=list(names(x), descnames))
for(i in names(x)) apfp[i, descnames %in% as.character(x[[i]])] <- 1
} else {
stop("x needs to be of class APset or list")
}
if(type=="FPset") {
#return(as(apfp, "FPset"))
return(new("FPset",fpma=apfp,type="apfp"))
}
if(type=="matrix") {
return(apfp)
}
if(type=="character") {
return(sapply(rownames(apfp), function(x) paste(apfp[x,], collapse="")))
}
}
## Usage:
# apfpset <- desc2fp(x=apset, descnames=1024, type="matrix")
## Accessor methods for APset class
setGeneric(name="ap", def=function(x) standardGeneric("ap"))
setMethod(f="ap", signature="AP", definition=function(x) { return(x@AP) })
setMethod(f="ap", signature="APset", definition=function(x) { tmp <- x@AP; names(tmp) <- x@ID; return(tmp) })
setMethod(f="cid", signature="APset", definition=function(x) { return(x@ID) })
## Replacement method for ID component of APset using accessor methods.
## Note: generic for cid() is defined under SDFset class section.
setReplaceMethod(f="cid", signature="APset", definition=function(x, value) {
x@ID <- value
if(any(duplicated(x@ID))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
}
return(x)
})
## Replacement method for APset using "[" operator
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="APset", definition=function(x, i, j, value) {
x@AP[i] <- ap(value)
x@ID[i] <- cid(value)
return(x)
})
## Behavior of "[" operator for APset
setMethod(f="[", signature="APset", definition=function(x, i, ..., drop) {
if(is.logical(i)) {
i <- which(i)
}
if(is.character(i)) {
ids <- seq(along=x@ID)
names(ids) <- x@ID
i <- ids[i]
}
x@AP <- x@AP[i]
x@ID <- x@ID[i]
if(any(duplicated(i))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
}
return(x)
})
## Replacement method for APset using "[[" operator
setReplaceMethod(f="[[", signature="APset", definition=function(x, i, value) {
x@AP[[i]] <- value
return(x)
})
## Behavior of "[[" operator for APset to convert single APset component to AP
setMethod(f="[[", signature="APset", definition=function(x, i, ..., drop) {
if(is.character(i)) { i <- which(x@ID %in% i) }
return(new("AP", AP=x@AP[[i]]))
})
## Length function
setMethod(f="length", signature="APset",
definition=function(x) {
return(length(ap(x)))
})
## Print behavior for APset
setMethod(f="show", signature="APset",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", " with ", length(object), " molecules", "\n", sep="")
if(any(duplicated(object@ID))) {
warning("The values in the CMP ID slot are not unique. To fix this, run: cid(apset) <- makeUnique(cid(apset))")
}
})
## Concatenate function for APset
## Note: is currently limited to 2 arguments!
setMethod(f="c", signature="APset", definition=function(x, y) {
aplist1 <- as(x, "list")
aplist2 <- as(y, "list")
aplist <- c(aplist1, aplist2)
apset <- as(aplist, "APset")
if(any(duplicated(cid(apset)))) {
warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
}
return(apset)
})
## Define print behavior for AP
setMethod(f="show", signature="AP",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\"", "\n", sep="")
cat("<<atom pairs>>", "\n", sep="")
if(length(object@AP)>=5) {
cat(c(object@AP[1:5], "... length:", length(object@AP), "\n"))
} else {
print(object@AP)}
})
## Coerce Methods for APset Class
## APset to list with many AP objects
setAs(from="APset", to="list",
def=function(from) {
ap(from)
})
## List of APs to APset
setAs(from="list", to="APset",
def=function(from) {
new("APset", AP=from, ID=names(from))
})
## APset to list with many AP objects (for summary view)
setAs(from="APset", to="AP",
def=function(from) {
tmp <- lapply(seq(along=from), function(x) from[[x]])
names(tmp) <- cid(from)
return(tmp)
})
setMethod(f="view", signature="APset", definition=function(x) { as(x, "AP") })
# view(apset)
## Coerce APset to old list-style descriptor database used by search/cluster functions
apset2descdb <- function(apset) {
list(descdb=ap(apset), cids=cid(apset), sdfsegs=NULL, source="SDFset", type="SDFset")
}
#######################################################
## (6) Class and Method Definitions for FP and FPset ##
#######################################################
## Define FP and FPset classes
setClass("FP", representation(fp="numeric",type="character",foldCount="numeric"),
prototype=list(foldCount=0))
setMethod(f="initialize", signature="FP",definition= function(.Object, ...){
obj <- callNextMethod(.Object, ...)
if(length(obj@type)==0)
obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
obj
})
setClass("FPset", representation(fpma="matrix",type="character",foldCount="numeric"),
prototype=list(foldCount=0))
setMethod(f="initialize", signature="FPset",definition= function(.Object, ...){
obj <- callNextMethod(.Object, ...)
if(length(obj@type)==0)
obj@type=paste("unknown",as.integer(runif(1)*10000),sep="-")
obj
})
## Methods to return FPSet as vector or matrix, respectively
setMethod(f="as.vector", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.numeric", signature="FP", definition=function(x) {return(x@fp)})
setMethod(f="as.character", signature="FP", definition=function(x) {return(paste(x@fp, collapse=""))})
setMethod(f="as.matrix", signature="FPset", definition=function(x) {return(x@fpma)})
setMethod(f="as.character", signature="FPset", definition=function(x) {sapply(rownames(x@fpma), function(y) paste(x@fpma[y,], collapse=""))})
## Constructor methods
## Numeric vector to FP with: as(myvector, "FP")
setAs(from="numeric", to="FP",
def=function(from) {
new("FP", fp=from)
})
## Matrix to FPset with: as(mymatrix, "FPset")
setAs(from="matrix", to="FPset",
def=function(from) {
new("FPset", fpma=from)
})
## Character to FPset with: as(mychar, "FPset")
setAs(from="character", to="FPset",
def=function(from) {
read.AP(from, type="fp")
})
## Accessor methods for FPset
## Note: generic for cid() is defined under SDFset class section.
setMethod(f="cid", signature="FPset", definition=function(x) { return(rownames(x@fpma)) })
## Replacement method for ID component of FPset using accessor methods.
setReplaceMethod(f="cid", signature="FPset", definition=function(x, value) {
rownames(x@fpma) <- value
if(any(duplicated(rownames(x@fpma)))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
}
return(x)
})
foldVector <- function(v,count=1,bits=NULL){
len = length(v)
if(len%%2!=0)
stop("must have an even number of bits to fold")
if(!is.null(bits))
count=foldsNeeded(len,bits)
#message("count: ",count," len: ",len)
realCount=0
for(i in rep(0,count)){
if(len==1)
break
mid = len/2
#message("mid: ",mid)
v = mapply(function(a,b) if(a||b)1 else 0,v[1:mid],v[(mid+1):len])
len = length(v)
realCount=realCount+1
}
if(count!=realCount)
warning(count," folds requested but could only fold ",realCount," times")
list(fp=v,actualFoldCount=realCount)
}
foldsNeeded <- function(length,bits){
count = log2(length) - log2(bits)
if(count != as.integer(count))
stop("not possible to achieve ",bits," with an integral number of folds")
count
}
setGeneric(name="fold", def=function(x,count=1,bits=NULL) standardGeneric("fold"))
setMethod(f="fold",signature="FP", definition=function(x,count,bits){
result = foldVector(x@fp,count,bits)
x@fp = result$fp
x@foldCount = x@foldCount+result$actualFoldCount
x
})
setMethod(f="fold",signature="FPset", definition=function(x,count,bits) {
actualFolds=0
data = apply(x@fpma,c(1),function(y){
result = foldVector(y,count,bits)
actualFolds <<- result$actualFoldCount #just keep the last one, they should all be the same
result$fp })
if(!is.matrix(data)) # account for case when fp gets folded down to 1
dim(data) = c(nrow(x@fpma),1)
else
data = t(data)
x@fpma = data
x@foldCount = x@foldCount+actualFolds
x
})
setGeneric(name="fptype", def=function(x) standardGeneric("fptype"))
setMethod(f="fptype", signature="FP",definition=function(x) x@type)
setMethod(f="fptype", signature="FPset",definition=function(x) x@type)
setGeneric(name="foldCount", def=function(x) standardGeneric("foldCount"))
setMethod(f="foldCount", signature="FP",definition=function(x) x@foldCount)
setMethod(f="foldCount", signature="FPset",definition=function(x) x@foldCount)
setGeneric(name="numBits", def=function(x) standardGeneric("numBits"))
setMethod(f="numBits", signature="FP",definition=function(x) length(x@fp))
setMethod(f="numBits", signature="FPset",definition=function(x) ncol(x@fpma))
## Replacement method for FPset using "[" operator
## It doesn't provide here full set of expected functionalities.
setReplaceMethod(f="[", signature="FPset", definition=function(x, i, value) {
x@fpma[i,] <- value
return(x)
})
## Behavior of "[" operator for FPset
setMethod(f="[", signature="FPset", definition=function(x, i, ..., drop) {
if(is.logical(i)) {
i <- which(i)
}
x@fpma <- x@fpma[i, , drop=FALSE]
if(any(duplicated(i))) {
warning("The values in the CMP ID slot are not unique anymore. To fix this, run: cid(fpset) <- makeUnique(cid(fpset))")
}
return(x)
})
## Behavior of "[[" operator for FPset to convert single FPset component to FP
setMethod(f="[[", signature="FPset", definition=function(x, i, ..., drop) {
if(is.character(i)) { i <- which(rownames(x@fpma) %in% i) }
return(new("FP", fp=x@fpma[i,]))
})
## Length function
setMethod(f="length", signature="FPset",
definition=function(x) {
return(length(as.matrix(x)[,1]))
})
## Define print behavior for FPset
setMethod(f="show", signature="FPset",
definition=function(object) {
cat("An instance of a ", length(object@fpma[1,]), " bit ", "\"", class(object),
"\" of type \"",object@type,"\" with ",
length(object@fpma[,1]), " molecules", "\n", sep="")
})
setMethod(f="c",signature="FP",definition=function(x,...){
args = list(x,...)
type=NULL
length=0
fpma = t(sapply(args,function(obj){
if(!inherits(obj,"FP"))
stop("all arguments must be of type FP")
if(is.null(type))
type<<-obj@type
if(length==0)
length <<- numBits(obj)
if(obj@type != type)
stop("found differing types: ",obj@type," and ",type)
if(numBits(obj) != length)
stop("found differing number of bits: ",numBits(obj)," and ",length)
obj@fp
}))
fpset=new("FPset",fpma=fpma,type=type)
if(any(duplicated(cid(fpset))))
warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
fpset
})
## Concatenate function for FPset
setMethod(f="c", signature="FPset", definition=function(x, ...) {
args = list(x,...)
type=NULL
length=0
fpma = Reduce(rbind,Map(function(obj){
if(!inherits(obj,"FPset"))
stop("all arguments must be of type FPset")
if(is.null(type))
type<<-obj@type
if(length==0)
length <<- numBits(obj)
if(obj@type != type)
stop("found differing types: ",obj@type," and ",type)
if(numBits(obj) != length)
stop("found differing number of bits: ",numBits(obj)," and ",length)
obj@fpma
},args))
fpset=new("FPset",fpma=fpma,type=type)
if(any(duplicated(cid(fpset))))
warning("The values in the CMP ID slot are not unique anymore, makeUnique() can fix this!")
fpset
})
## Define print behavior for FP
setMethod(f="show", signature="FP",
definition=function(object) {
cat("An instance of ", "\"", class(object), "\" of type \"",object@type, "\"\n", sep="")
cat("<<fingerprint>>", "\n", sep="")
if(length(object@fp)>=20) {
cat(c(object@fp[1:20], "... length:", length(object@fp), "\n"))
} else {
print(object@fp)}
})
## FPset to list with many FP objects (for summary view)
setAs(from="FPset", to="FP",
def=function(from) {
tmp <- lapply(seq(along=from), function(x) from[[x]])
names(tmp) <- cid(from)
return(tmp)
})
setMethod(f="view", signature="FPset", definition=function(x) { as(x, "FP") })
# view(fpset)
###################
## (7) Utilities ##
###################
#################################################
## (6.1) Detect Invalid SDFs in SDFset Objects ##
#################################################
validSDF <- function(x, Nabcol = 3, Nbbcol = 3, logic="&", checkNA=TRUE) {
if(class(x)!="SDFset")
warning("x needs to be of class SDFset")
ab <- atomblock(x);
abcol <- sapply(names(ab), function(x) length(ab[[x]][1,]))
bb <- bondblock(x);
bbcol <- sapply(names(bb), function(x) length(bb[[x]][1,]))
if(logic=="|")
validsdf <- abcol >= Nabcol | bbcol >= Nbbcol
if(logic=="&")
validsdf <- abcol >= Nabcol & bbcol >= Nbbcol
if(checkNA==TRUE) {
abNA <- sapply(names(ab), function(x) !any(is.na(ab[[x]])))
bbNA <- sapply(names(bb), function(x) !any(is.na(bb[[x]])))
validsdf <- validsdf & abNA & bbNA
}
return(validsdf)
}
######################################################################
## (6.2) Create Unique CMP Names by Appending a Counter to Duplates ##
######################################################################
makeUnique <- function(x, silent=FALSE) {
if(all(!duplicated(x))) {
if(silent!=TRUE) print("No duplicates detected!")
return(x)
} else {
count <- table(x); count <- count[x]
dupids <- count[count>1]; dupids <- dupids[!duplicated(names(dupids))]
for(i in seq(along=dupids)) {
names(count)[names(count) %in% names(dupids[i])] <- paste(names(dupids)[i], seq(1, dupids[i]), sep="_")
}
if(silent!=TRUE) print(paste("Counter appended to", length(dupids), "duplicates!"))
return(names(count))
}
}
###############################
## (6.3) Molecule Properties ##
###############################
## (6.3.1) Atom count matrix
atomcountMA <- function(x, ...) {
if(class(x)=="SDF") x <- as(x, "SDFset")
atomcountlist <- atomcount(x, ...)
columns <- unique(unlist(lapply(seq(along=atomcountlist), function(x) names(atomcountlist[[x]]))))
myMA <- matrix(NA, length(atomcountlist), length(columns), dimnames=list(NULL, columns))
for(i in seq(along=atomcountlist)) myMA[i, names(atomcountlist[[i]])] <- atomcountlist[[i]]
myMA[is.na(myMA)] <- 0
rownames(myMA) <- names(atomcountlist)
return(myMA)
}
## (6.3.2) Molecular weight (MW data from http://iupac.org/publications/pac/78/11/2051/)
data(atomprop); atomprop=atomprop
MW <- function(x, mw=atomprop, ...) {
if(class(x)=="SDF") x <- as(x, "SDFset")
## Create MW vector with atom symbols in name slot
AW <- mw$Atomic_weight
names(AW) <- mw$Symbol
## Calculate MW
propma <- atomcountMA(x, ...)
MW <- rowSums(t(t(propma) * AW[colnames(propma)]), na.rm = TRUE)
return(MW)
}
## (6.3.3) Molecular formula
MF <- function(x, ...) {
if(class(x)=="SDF") x <- as(x, "SDFset")
propma <- atomcountMA(x, ...)
propma <- propma[c(1, seq(along=propma[,1])),] # Duplicates first row to support processing of single molecule with same code
hillorder <- colnames(propma); names(hillorder) <- hillorder
hillorder <- na.omit(unique(hillorder[c("C", "H", sort(hillorder))]))
propma <- propma[, hillorder]
propma[propma==1] <- ""
MF <- paste(colnames(propma), t(propma), sep="")
propma <- matrix(MF, nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)), byrow=TRUE)
MF <- seq(along=propma[,1]); names(MF) <- rownames(propma)
zeroma <- matrix(grepl("[\\*A-Za-z]0$", propma), nrow=length(propma[,1]), ncol=length(propma[1,]), dimnames=list(rownames(propma), colnames(propma)))
propma[zeroma] <- ""
for(i in seq(along=MF)) { MF[i] <- paste(propma[i,], collapse="") }
return(MF[-1]) # Minus one to remove duplicated entry in first row of propma
}
## (6.3.4) Ring Perception and Aromaticity Assignment
## Implements with some modifications the exhaustive ring perception algorithm
## from Hanser et al (1996). URL: http://pubs.acs.org/doi/abs/10.1021/ci960322f
## (a) Iterative removal of atoms with single non hydrogen bonds.
## Returns from molecule only its rings and their inter-connections.
.cyclicCore <- function(x) {
if(length(x) != 1) stop("x needs to be a single molecule")
if(class(x) == "SDFset") x <- x[[1]]
path <- conMA(x, exclude="H")
noconnect <- rowSums(path) != 0 # Removes atoms with no connections
path <- path[noconnect, noconnect]
if(all(dim(path) == 0)) { return(path) }
term <- which(rowSums(path > 0)==1)
while(length(term) > 0) {
path <- path[-term,-term]
if(any(dim(path) == 0) | is.vector(path)) { break() }
term <- which(rowSums(path > 0)==1)
}
return(path)
}
## (b) Function to return the longest possible linear bond paths where:
## - internal atoms have only two heavy atom neighbors
## - terminal atoms are atoms with more than two heavy atom neighbors or they are ring closures
.linearCon <- function(x) {
secatoms <- rowSums(x > 0) == 2
secatoms <- names(which(secatoms))
.linearCon <- function(vertex, x=x) {
con <- as.list(names(which(x[vertex, ] > 0)))
for(i in seq(along=con)) {
termatom <- con[[i]][length(con[[i]])]
termcon <- names(which(x[termatom, ] > 0))
termcon <- termcon[!termcon %in% vertex]
while(length(termcon) == 1 & !any(duplicated(con[[i]]))) {
con[[i]] <- c(con[[i]], termcon)
termatom <- con[[i]][length(con[[i]])]
termcon <- names(which(x[termatom, ] > 0))
termcon <- termcon[!termcon %in% con[[i]][length(con[[i]])-1]]
}
}
if(paste(sort(unique(con[[1]])), collapse="") == paste(sort(unique(con[[2]])), collapse="")) {
return(con[[1]])
} else {
return(c(rev(con[[1]]), vertex, con[[2]]))
}
}
linearconlist <- lapply(secatoms, function(y) .linearCon(vertex=y, x=x))
nodups <- !duplicated(sapply(linearconlist, function(y) paste(sort(unique(y)), collapse="")))
linearconlist <- linearconlist[nodups]
return(linearconlist)
}
## (c) Assemble intermediate results in a list
.update <- function(con, path) {
## Remove non-terminal atoms in each path
center_atoms <- unique(unlist(lapply(path, function(x) x[-c(1, length(x))])))
con1 <- con[!rownames(con) %in% center_atoms, !colnames(con) %in% center_atoms]
## Add atom pairs with three neighbors to connection list
if(is.matrix(con1)) {
remainbonds <- con1
remainbonds[lower.tri(remainbonds)] <- 0
remainbonds <- lapply(rownames(remainbonds), function(x) names(which(remainbonds[x,] > 0)))
names(remainbonds) <- rownames(con1)
remainbonds <- cbind(rep(names(remainbonds), sapply(remainbonds, length)), unlist(remainbonds, use.names=F))
remainbonds <- split(remainbonds, seq(along=remainbonds[,1]))
path <- c(path, remainbonds)
names(path) <- seq(along=path)
}
## Collect complete rings and remove them from path object
index <- unlist(lapply(path, function(y) any(duplicated(y))))
rings <- path[index]
path <- path[!index]
names(path) <- seq(along=path)
## Connection list for path component
conpath <- t(sapply(path, function(x) x[c(1, length(x))]))
ends <- unique(as.vector(conpath))
conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
names(conpath) <- ends
conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once
## Assemble results in list
return(list(con=con, conpath=conpath, path=path, rings=rings))
}
## (d) Return rings from cyclist object
.rings <- function(cyclist, upper=Inf) {
## Define data containers
pathlist <- cyclist$path
conpath <- cyclist$conpath
pathlistnew <- list()
rings <- list()
## Loop to join linear paths/fragments stored in pathlist
for(i in names(conpath)) {
if(length(conpath) == 0 | !any(names(conpath) == i)) { next() }
pos <- t(combn(conpath[[i]], m=2))
for(j in seq(along=pos[,1])) {
p1 <- pathlist[[pos[j,1]]]
p2 <- pathlist[[pos[j,2]]]
if(sum(p1[-c(1,length(p1))] %in% p2[-c(1,length(p2))]) > 0) {
next()
}
if(p1[1] == i & p2[1] == i) { # matching s1:s2
pathlistnew[[length(pathlistnew)+1]] <- c(rev(p2[-1]), p1)
}
if(p1[length(p1)] == i & p2[length(p2)] == i) { # matching e1:e2
pathlistnew[[length(pathlistnew)+1]] <- c(p1, rev(p2[-length(p2)]))
}
if(p1[1] == i & p2[length(p2)] == i) { # matching s1:e2
pathlistnew[[length(pathlistnew)+1]] <- c(p2, p1[-1])
}
if(p1[length(p1)] == i & p2[1] == i) { # matching e1:s2
pathlistnew[[length(pathlistnew)+1]] <- c(p1, p2[-1])
}
}
## Various postprocessing routines for joined fragments follow
if(length(pathlistnew) == 0) { next() }
## Remove duplicates
dups <- duplicated(sapply(pathlistnew, function(x) paste(sort(unique(x)), collapse="_")))
pathlistnew <- pathlistnew[!dups]
## Set maximum ring size; improves time performance if outer rings are not needed
if(upper != Inf) {
l <- sapply(pathlistnew, length)
pathlistnew <- pathlistnew[l <= upper]
if(length(pathlistnew) == 0) { next() }
}
## Collect complete rings and remove them from path object
index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y[c(1, length(y))]))))
rings[[length(rings)+1]] <- pathlistnew[index]
pathlistnew <- pathlistnew[!index]
## Remove paths with internal duplicates
if(length(pathlistnew) > 0) {
index <- unlist(lapply(pathlistnew, function(y) any(duplicated(y))))
pathlistnew <- pathlistnew[!index]
}
## Update pathlist and conpath
pathlist <- c(pathlist[-conpath[[i]]], pathlistnew)
dups <- duplicated(sapply(pathlist, function(x) paste(sort(unique(x)), collapse="_")))
pathlist <- pathlist[!dups]
names(pathlist) <- seq(along=pathlist)
conpath <- t(sapply(pathlist, function(x) x[c(1, length(x))]))
ends <- unique(as.vector(conpath))
conpath <- lapply(ends, function(x) as.numeric(names(which(rowSums(conpath==x) > 0))))
names(conpath) <- ends
conpath <- conpath[sapply(conpath, length) > 1] # removes ends that occur only once
pathlistnew <- list()
}
## Generate proper output format
rings <- unlist(rings, recursive=FALSE)
dups <- duplicated(sapply(rings, function(x) paste(sort(unique(x)), collapse="_")))
rings <- c(cyclist$rings, rings[!dups])
l <- sapply(rings, length)
rings <- rings[order(l)]
if(upper != Inf) { rings <- rings[l <= upper] }
if(length(rings) > 0) {
names(rings) <- paste("ring", seq(along=rings), sep="")
} else {
rings <- NULL
}
return(rings)
}
## (e) Identify inner rings
## Expected input x is list of rings from call:
## x <- rings(x=sdfset[[1]], upper=Inf, type="all", arom=FALSE)
.is.inner <- function(x) {
rnames <- rev(names(x)); names(rnames) <- rnames
r <- x
names(r) <- paste(names(r), "_", sep="")
r <- unlist(r)
for(i in rnames) {
tmp <- x[[i]]
r2 <- r[!gsub("_.*", "", names(r)) %in% i]
if(all(tmp %in% r2)) {
rnames[i] <- "redundant"
r <- r2
}
}
return(x[rev(rnames!="redundant")])
}
## (f) Aromaticity assignment for rings
## Approach
## (i) Identify rings where all atoms are sp2 hybridized. This means each atom has a
## double bond or at least one lone electron pair and is attached to a sp2 hybridized atom
## (ii) Hueckel's rule needs to be true (4n+2=integer): 2, 6, 10, 14, 18, ... pi electrons
## per ring.
.is.arom <- function(sdf, rings) {
if(length(rings)==0) { return(NULL) }
con <- conMA(sdf)
b2 <- bonds(sdf)
b <- b2[,"Nbondcount"] - b2[,"charge"]
names(b) <- paste(b2[,"atom"], "_", seq(along=b2[,1]), sep="")
.neighborsFct <- function(x) { sapply(rownames(x), function(y) paste(sort(paste(x[y,][x[y,]!=0], sep="_")), collapse="_")) }
## Determine aromaticity
.arom <- function(con, b, r=rings[[1]]) {
## Identify sp2 hybridized atoms
sp <- .neighborsFct(x=con)[r]
sp2 <- grepl("1_2$", sp)
## Identify atoms with lone electron pairs
el <- c(Al=3, As=5, At=7, B=3, Bi=5, Br=7, C=4, Cl=7, F=7, Ga=3, Ge=4, I=7, In=3, N=5, O=6, P=5, Pb=4, Po=6, S=6, Sb=5, Se=6, Si=4, Sn=4, Te=6, Tl=3)
bsub <- b[names(sp)]
lp <- el[gsub("_.*", "", names(bsub))] - bsub >= 2
lp[is.na(lp)] <- 0 # If an element is not specified under el, then its NA value in lp is set to zero
sp2lp <- all(sp2 | lp)
## Hueckel's rule
d <- rowSums(con[r, r]==2)
double <- sum(d)
if(length(d[lp]) != 0) {
lpcount <- sum(lp[d==0]) * 2
} else {
lpcount <- 0
}
n <- ((double + lpcount) - 2) / 4
is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol
hueckel <- is.wholenumber(n)
return(sp2lp & hueckel)
}
return(sapply(names(rings), function(x) .arom(con, b, r=rings[[x]])))
}
## Inner rings inherit aromaticity assignment from super rings if they are fully contained in them
## This is necessary since the aromaticity assignment for inner rings may miss context information required
## for Hueckel's rule. Note, for this to work the input data has to be generated with .rings(x, Inf)
.containedArom <- function(myrings, myarom) {
if(!all(names(myrings) %in% names(myarom))) stop("Both inputs need to have the same number of rings with identical names")
if(all(!myarom)) {
return(myarom)
} else {
nonaromrings <- myrings[!myarom] # Restricts loop to non-aromatic entries
aromrings <- myrings[myarom]
for(i in seq_along(nonaromrings)) {
updatearom <- sapply(aromrings, function(x) all(nonaromrings[[i]] %in% x))
if(any(updatearom)) myarom[names(nonaromrings[i])] <- TRUE
}
return(myarom)
}
}
## (g) Ring and aromaticity perception by running the functions (1)-(5)
rings <- function(x, upper=Inf, type="all", arom=FALSE, inner=FALSE) {
if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
if(inner==TRUE & upper!=Inf) stop("Inner ring prediction requires upper=Inf")
if(type=="arom" & arom==FALSE) stop("type='arom' requires arom=TRUE")
runAll <- function(x, upper) {
con <- .cyclicCore(x)
if(any(dim(con) == 0) | is.vector(con) | all(con == 0)) { # TRUE for linear compounds
if(arom==TRUE) {
if(type=="all") return(list(RINGS=NULL, AROMATIC=NULL))
if(type=="count") return(c(RINGS=0, AROMATIC=0))
} else {
if(type=="all") return(NULL)
if(type=="count") return(0)
}
} else {
path <- .linearCon(x=con)
cyclist <- .update(con=con, path=path)
if(arom==TRUE) {
myrings <- .rings(cyclist, Inf) # Full ring set required to identify remaining aromatic rings with .containedArom
myrings <- lapply(myrings, function(x) x[-1]) # Removes duplicated atom at ring closure
myarom <- .is.arom(sdf=x, rings=myrings)
myarom <- .containedArom(myrings, myarom)
if(upper==Inf & inner==TRUE) {
myringnames <- names(.is.inner(x=myrings)) # Returns names of inner rings only
myrings <- myrings[myringnames]
myarom <- myarom[myringnames]
}
if(upper!=Inf) {
uppernames <- names(myrings[sapply(myrings, length) <= upper])
myrings <- myrings[uppernames]
myarom <- myarom[uppernames]
}
if(type=="all") return(list(RINGS=myrings, AROMATIC=myarom))
if(type=="arom") return(list(AROMATIC_RINGS=myrings[myarom]))
if(type=="count") return(c(RINGS=length(myrings), AROMATIC=sum(myarom)))
} else {
myrings <- .rings(cyclist, upper+1) # Plus 'upper+1' is required because at this step all rings have duplicated atoms at ring closure
myrings <- lapply(myrings, function(x) x[-1]) # Removes duplicated atom at ring closure
if(upper==Inf & inner==TRUE) myrings <- .is.inner(x=myrings) # Reduces myrings to inner rings only
if(type=="all") return(myrings)
if(type=="count") return(length(myrings))
}
}
}
if(class(x)=="SDF") {
return(runAll(x, upper))
}
if(class(x)=="SDFset" & length(x) == 1) {
return(runAll(x[[1]], upper))
}
if(class(x)=="SDFset" & length(x) > 1) {
myrings <- lapply(1:length(x), function(y) runAll(x[[y]], upper))
names(myrings) <- cid(x)
if(type=="all" | type=="arom") return(myrings)
if(type=="count") {
mycol <- c("RINGS", "AROMATIC")
return(matrix(unlist(myrings), ncol=length(myrings[[1]]), dimnames=list(names(myrings), mycol[1:length(myrings[[1]])]), byrow=T))
}
}
}
## Usage:
# rings(sdfset[1:4], upper=6, type="all", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="arom", arom=TRUE)
# rings(sdfset[1:4], upper=6, type="count", arom=TRUE)
# plot(sdfset[1], print=F, atomnum=T, no_print_atoms="H")
# plot(sdfset[1:4], print=F, atomnum=T, no_print_atoms="H")
## (6.3.5) Enumerate Functional Groups
## (a) Generate neighbor information for each heavy atom in a molecule
.neighbors <- function(x, type="countMA") {
## Input checks
if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
if(!any(c("all", "count", "countMA") %in% type)) stop("type can only be assigned: all, count or countMA")
## Return neighbors
.neighborsFct <- function(x, type=type) {
colnames(x) <- paste(colnames(x), colSums(x>0), sep="_") # Adds number of bonded heavy atom neighbors (non-hydrogens)
neighbors <- sapply(rownames(x), function(y) paste(sort(paste(gsub("_.*_", "_", colnames(x)[x[y,]!=0]), x[y,][x[y,]!=0], sep="_")), collapse="_"))
if(type=="all") {
return(neighbors)
}
if(type=="count" | type=="countMA") {
return(table(paste(gsub("_.*", "", names(neighbors)), neighbors, sep=":")))
}
}
## Run on SDF object
if(class(x)=="SDF") {
x <- conMA(x, exclude="H")
neighbors <- .neighborsFct(x, type)
return(neighbors)
}
## Run on SDFset objects containing one or many molecules
if(class(x)=="SDFset") {
cid <- cid(x)
x <- conMA(x, exclude="H")
neighbor_set <- lapply(seq(along=x), function(y) .neighborsFct(x[[y]], type))
names(neighbor_set) <- cid
if(type=="all" | type=="count") {
return(neighbor_set)
}
if(type=="countMA") {
columns <- unique(unlist(lapply(seq(along=neighbor_set), function(x) names(neighbor_set[[x]]))))
myMA <- matrix(NA, length(neighbor_set), length(columns), dimnames=list(NULL, columns))
for(i in seq(along=neighbor_set)) myMA[i, names(neighbor_set[[i]])] <- neighbor_set[[i]]
myMA[is.na(myMA)] <- 0
rownames(myMA) <- cid
return(myMA)
}
}
}
## Usage:
# .neighbors(sdfset[1:4], type="all")
# .neighbors(sdfset[1:4], type="countMA")
## (b) Count functional groups
groups <- function(x, groups="fctgroup", type="countMA") {
## Input checks
if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
if(groups=="fctgroup" & (type=="count" | type=="all")) stop("when groups=\"fctgroup\", only type=\"countMA\" can be used")
mylength <- length(x)
## Support for single molecule objects
if(mylength==1) {
x <- as(x, "SDFset");
y <- x;
z <- x;
cid(z) <- "dummy";
x <- c(y, z)
}
## Generate neighbor counts
neighbors <- .neighbors(x, type)
## Return neighbor counts if requested
if(groups[1]=="neighbors") {
if(is.matrix(neighbors)) {
neighbors <- neighbors[!rownames(neighbors) %in% "dummy", ]
} else {
neighbors <- neighbors[!names(neighbors) %in% "dummy"]
}
return(neighbors)
}
## Count functional groups based on data stored in neighbor matrix
if(groups[1]=="fctgroup") {
groups <- c(RNH2="^C:.*N_1_1$", R2NH="^N:C_._1_C_._1$", R3N="^N:C_._1_C_._1_C_._1$",
ROPO3="^P:O_._._O_._._O_._._O_._.$", ROH="(?=^C:.*O_1_1)(?=^(?:(?!(N|O|P|S)_._(2|3)).)*$)",
RCHO="^O:C_2_2", RCOR="^C:C_._1_C_._1.*O_1_2", RCOOH="^C:.*O_1_1_O_1_2",
RCOOR="^C:.*O_1_2_O_2_1", ROR="^O:.*C_._1_C_._1", RCCH="^C:C_2_3", RCN="^N:C_2_3")
}
groupMA <- sapply(names(groups), function(x) rowSums(neighbors[, rep(grep(groups[x], colnames(neighbors), perl=TRUE),2)]/2))
## Fix counts for ambiguous functional groups
if(c("ROR" %in% colnames(groupMA))) {
groupMA[, "ROR"] <- groupMA[, "ROR"] - groupMA[, "RCOOR"]
groupMA[groupMA < 0] <- 0
}
if(mylength>1) {
return(groupMA)
} else {
return(groupMA[1,,drop=FALSE])
}
}
## Usage:
# groups(sdfset[1:20], groups="fctgroup", type="countMA")
# groups(sdfset[1:4], groups="neighbors", type="countMA")
# groups(sdfset[1:4], groups="neighbors", type="count")
# groups(sdfset[1:4], groups="neighbors", type="all")
##############################################################
## (6.4) Convert SDF Tail to Numeric and Character Matrices ##
##############################################################
## (6.4.1) Store everything in one character matrix
datablock2ma <- function(datablocklist, cleanup=" \\(.*", ...) {
if(exists("cleanup"))
for(i in seq(along=datablocklist))
names(datablocklist[[i]]) <- gsub(cleanup, "", names(datablocklist[[i]])) # Required if name tags contain compound ids
columns <- unique(unlist(lapply(seq(along=datablocklist), function(x) names(datablocklist[[x]]))))
myMA <- matrix(NA, length(datablocklist), length(columns), dimnames=list(NULL, columns))
for(i in seq(along=datablocklist))
myMA[i, names(datablocklist[[i]])] <- datablocklist[[i]]
rownames(myMA) <- names(datablocklist)
return(myMA)
}
# Usage:
# blockmatrix <- datablock2ma(datablocklist=datablock(sdfset))
## (6.4.2) Split SDF tail matrix into character and numeric matrices
splitNumChar <- function(blockmatrix) {
# Define function to check for valid numeric values in a character vector
numberAble <- function(myvec, type=c("single", "vector"), extras = c(".", "NA")) {
type <- match.arg(type)
old <- options(warn = -1)
on.exit(options(old))
myvec <- gsub(" ", "", myvec)
myvecsub <- myvec[!myvec %in% c("", extras)]
isnum <- !any(is.na(as.numeric(myvecsub)))
if (type == "single")
isnum
else if (isnum)
as.numeric(myvec)
else myvec
}
colindex <- sapply(seq(along=blockmatrix[1,]), function(x) numberAble(blockmatrix[,x]))
numMA <- blockmatrix[ , colindex]
storage.mode(numMA) <- "numeric"
charMA <- blockmatrix[ , !colindex]
return(list(numMA=numMA, charMA=charMA))
}
# Usage:
# numchar <- splitNumChar(blockmatrix=blockmatrix)
#########################
## (6.5) Bond Matrices ##
#########################
## (6.5.1) Generate bond matrix from SDFset or SDF objects
conMA <- function(x, exclude="none") {
## Function for SDF object
.conMA <- function(x, exclude=exclude) {
atoms <- rownames(atomblock(x))
conma <- matrix(0, length(atoms), length(atoms), dimnames=list(atoms, atoms))
bondblock <- bondblock(x)
for(i in seq(along=bondblock[,1])) conma[bondblock[i,1], bondblock[i,2]] <- bondblock[i,3]
for(i in seq(along=bondblock[,1])) conma[bondblock[i,2], bondblock[i,1]] <- bondblock[i,3]
index <- !gsub("_.*", "", rownames(conma)) %in% exclude
conma <- conma[index, index, drop = FALSE]
return(conma)
}
## Run on SDF objects
if(class(x)=="SDF") {
conma <- .conMA(x, exclude)
return(conma)
}
## Run on SDFset objects containing one or many molecules
if(class(x)=="SDFset") {
conma_set <- lapply(seq(along=x), function(y) .conMA(x[[y]], exclude))
names(conma_set) <- cid(x)
return(conma_set)
}
}
# Usage:
# conma <- conMA(sdfset[1:2], exclude=c("H"))
## (6.5.2) Compute bond/charge count for each atom in SDFset or SDF objects
## This is used to add hydrogens with methods/functions atomcount, atomcountMA, MW and MF
bonds <- function(x, type="bonds") {
## Input checks
if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
if(!any(c("bonds", "charge", "addNH") %in% type)) stop("type can only be assigned: bonds, charge or addNH")
## Compute bonds, charges and missing hydrogens
.bonds <- function(x, type=type) {
atomMA <- atomblock(x)
atoms <- gsub("_.*", "", rownames(atomMA))
bondMA <- bondblock(x)
Nbonds1 <- cbind(atoms=c(bondMA[,1], bondMA[,2]), bonds=c(bondMA[,3], bondMA[,"C3"]))
Nbonds1 <- tapply(Nbonds1[, "bonds"], Nbonds1[, "atoms"], sum)
Nbonds <- rep(0, length(atomMA[,1])); names(Nbonds) <- seq(along=atomMA[,1]); Nbonds[names(Nbonds1)] <- Nbonds1
## Valence related to position in periodic table (following octet rule)
val <- c("1"=1, "17"=1, "2"=2, "16"=2, "13"=3, "15"=3, "14"=4)
group <- as.numeric(atomprop$Group); names(group) <- as.character(atomprop$Symbol)
Nbondrule <- val[as.character(group[atoms])]
Nbondrule[is.na(Nbondrule)] <- 0 # Atoms with undefined Nbondrule (NAs) are assigned zero
Nbondrule[Nbondrule < Nbonds] <- Nbonds[Nbondrule < Nbonds] # Set Nbondrule to Nbonds values, where latter is larger
## Charges
charge <- c("0"=0, "1"=3, "2"=2, "3"=1, "4"=0, "5"=-1, "6"=-2, "7"=-3) # 4 is "doublet_radical"
charge <- charge[as.character(atomMA[,5])]
Nbonds <- data.frame(atom=atoms, Nbondcount=Nbonds, Nbondrule=Nbondrule, charge=charge)
## Data type to return
if(type=="bonds") { return(Nbonds) }
if(type=="charge") {
chargeindex <- Nbonds[, "charge"] != 0
if(sum(chargeindex) == 0) {
return(NULL)
} else {
chargeDF <- Nbonds[chargeindex, ]
charge <- chargeDF[, "charge"]
names(charge) <- chargeDF[, "atom"]
return(charge)
}
}
if(type=="addNH") {
Nbonds[Nbonds[,"Nbondcount"] >= Nbonds[,"Nbondrule"], "charge"] <- 0 # Ignore charge where Nbondcount greater or equal than Nbondrule
Nbonds[Nbonds[,"Nbondcount"] == 0, c("Nbondrule", "charge")] <- 0 # Ignore atoms with zero bonds
NH <- sum((Nbonds[, "Nbondrule"] + Nbonds[, "charge"]) - Nbonds[, "Nbondcount"])
if(NH < 0) NH <- 0 # Count should not be negative
return(NH)
}
}
## Run on SDF objects
if(class(x)=="SDF") {
bonds <- .bonds(x, type)
return(bonds)
}
## Run on SDFset objects containing one or many molecules
if(class(x)=="SDFset") {
bonds_set <- lapply(seq(along=x), function(y) .bonds(x[[y]], type))
names(bonds_set) <- cid(x)
if(type=="bonds") {
return(bonds_set)
}
if(type=="charge") {
return(bonds_set)
}
if(type=="addNH") {
return(unlist(bonds_set))
}
}
}
# Usage:
# bondDF <- bonds(sdfset[1], type="df"))
# bondcount <- bonds(sdfset[1], type="addNH"))
##########################################################################
## (6.6) Subset SDF/SDFset Objects by Atom Index to Obtain Substructure ##
##########################################################################
## Function to obtain substructure from SDF/SDFset object by providing a row
## index for atom block. Both atom and bond blocks will be subsetted accordingly.
## Two functions are combined in one: type="new" assigns new atom numbers to
## the subsetted SDF, while type="old" maintains the numbering of the source SDF.
atomsubset <- function (x, atomrows, type="new", datablock = FALSE) {
## Variant that assigns new numbers to atoms in subsetted SDF
if(type=="new") {
if(!any(c("SDF", "SDFset") %in% class(x)))
stop("x needs to be of class SDF or SDFset")
if(class(x) == "SDFset" & class(atomrows) != "list")
stop("if x is of class SDFset, atomrows argument needs to be a list")
if(class(x) == "SDFset") {
if (!all(cid(x) == names(atomrows)))
stop("x and atomrows need to have same length and identical component (molecule) names")
}
.atomsubset <- function(x, atomrows) {
hb <- header(x)
ab <- atomblock(x)[atomrows, ]
bb <- bondblock(x)
index <- rowSums(cbind(bb[, 1] %in% atomrows, bb[, 2] %in%
atomrows)) == 2
bb <- bb[index, ]
pos <- as.numeric(gsub(".*_", "", rownames(ab)))
oripos <- 1:length(pos)
names(oripos) <- pos
tmp <- bb[, 1:2]
tmp2 <- tmp
for (i in oripos) tmp2[tmp == as.numeric(names(oripos[i]))] <- i
bb[, 1:2] <- tmp2
if (is.vector(bb)) {
bb <- t(as.matrix(bb))
}
countsLine <- hb[4]
atomCount <- sprintf("%3d", length(atomrows))
bondCount <- sprintf("%3d", length(rowSums(bb)))
# update atom count and bond count
hb[4] <- paste(atomCount, bondCount, substr(countsLine, 7, 100000L), sep="")
# update bond block row names
row.names(bb) <- 1:length(rowSums(bb))
# update atom block row names
row.names(ab) <- paste(gsub("_.*", "",rownames(ab)), 1:length(atomrows), sep="_")
if (datablock == FALSE) {
sdf <- new("SDF", header = hb, atomblock = ab, bondblock = bb)
return(sdf)
}
if (datablock == TRUE) {
sdf <- new("SDF", header = hb, atomblock = ab, bondblock = as.matrix(bb),
datablock = datablock(x))
return(sdf)
}
}
if (class(x) == "SDF") {
return(.atomsubset(x, atomrows))
}
if (class(x) == "SDFset") {
ids <- cid(x)
sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]],
atomrows[[y]]))
names(sdflist) <- ids
sdfset <- as(sdflist, "SDFset")
return(sdfset)
}
}
## Variant that maintains atom numbers from source SDF in subsetted SDF
if(type=="old") {
if(!any(c("SDF", "SDFset") %in% class(x))) stop("x needs to be of class SDF or SDFset")
if(class(x)=="SDFset" & class(atomrows)!="list") stop("if x is of class SDFset, atomrows argument needs to be a list")
if(class(x)=="SDFset") {
if(!all(cid(x) == names(atomrows))) stop("x and atomrows need to have same length and identical component (molecule) names")
}
.atomsubset <- function(x, atomrows) {
hb <- header(x)
ab <- atomblock(x)[atomrows, ]
bb <- bondblock(x)
index <- rowSums(cbind(bb[,1] %in% atomrows, bb[,2] %in% atomrows)) == 2
bb <- bb[index,]
## Update bb to positions in ab
pos <- as.numeric(gsub(".*_", "", rownames(ab)))
oripos <- 1:length(pos)
names(oripos) <- pos
tmp <- bb[,1:2]; tmp2 <- tmp
for(i in oripos) tmp2[tmp==as.numeric(names(oripos[i]))] <- i
bb[,1:2] <- tmp2
## Outputs
if(is.vector(bb)) { bb <- t(as.matrix(bb)) }
if(datablock==FALSE) {
sdf <- new("SDF", header=hb, atomblock=ab, bondblock=bb)
return(sdf)
}
if(datablock==TRUE) {
sdf <- new("SDF", header=hb, atomblock=ab, bondblock=as.matrix(bb), datablock=datablock(x))
return(sdf)
}
}
if(class(x)=="SDF") {
return(.atomsubset(x, atomrows))
}
if(class(x)=="SDFset") {
ids <- cid(x)
sdflist <- lapply(cid(x), function(y) atomsubset(sdfset[[y]], atomrows[[y]]))
names(sdflist) <- ids
sdfset <- as(sdflist, "SDFset")
return(sdfset)
}
}
}
## Usage:
# atomsubset(sdfset[[1]], atomrows=1:18, type="new")
# atomsubset(sdfset[1:2], atomrows=list(CMP1=1:18, CMP2=1:12), type="new")
###################################
## (6.7) Function write.SDFsplit ##
###################################
## Splits SD Files into any number of smaller SD Files
write.SDFsplit <- function(x, filetag, nmol) {
from <- seq(1, length(x), by=nmol)
splitDF <- data.frame(from=from, to=c(from[-1], length(x)+1)-1, filename=NA)
splitDF[,"filename"] <- paste(filetag, sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,1]), "_",
sprintf(paste("%0", nchar(as.character(length(x))), "d", sep=""), splitDF[,2]), ".sdf", sep="")
for(i in seq(along=splitDF[,1])) {
write.SDF(x[splitDF[i,"from"]:splitDF[i,"to"]], splitDF[i,"filename"])
}
return(splitDF) # Gives access to file names to simplify import of split SD Files
}
## Usage
# write.SDFsplit(x=sdfstr, filetag="myfile", nmol=10)
# write.SDFsplit(x=sdfsample, filetag="myfile", nmol=10)
######################################
## (6.8) Streaming Through SD Files ##
######################################
## Streaming function to compute descriptors for large SD Files without consuming much memory.
## In addition to descriptor values, it returns a line index that defines the positions of each
## molecule in the source SD File. This line index can be used by the read.SDFindex function to
## retrieve specific compounds of interest from large SD Files without reading the entire file
## into memory.
sdfStream <- function(input, output, append=FALSE, fct, Nlines=10000, startline=1, restartNlines=10000, silent=FALSE, ...) {
## Define loop parameters
stop <- FALSE
f <- file(input, "r")
n <- Nlines
offset <- 0
## For restarting sdfStream at specific line assigned to startline argument. If assigned
## startline value does not match the first line of a molecule in the SD file then it
## will be reset to the start position of the next molecule in the SD file.
if(startline!=1) {
fmap <- file(input, "r")
shiftback <- 2
chunkmap <- scan(fmap, skip=startline-shiftback, nlines=restartNlines, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
startline <- startline + (which(grepl("^\\${4,4}", chunkmap, perl=TRUE))[1] + 1 - shiftback)
if(is.na(startline)) stop("Invalid value assigned to startline.")
dummy <- scan(f, skip=startline-2, nlines=1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
close(fmap)
offset <- startline - 1 # Maintains abolut line positions in index
}
counter <- 0
cmpid <- 1
partial <- NULL
while(!stop) {
counter <- counter + 1
chunk <- readLines(f, n = n) # chunk n can be any number of lines
# chunk <- scan(f, n=n, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n") # scan has more flexibilities for reading specific line ranges in files.
if(length(chunk) > 0) {
if(length(partial) > 0) {
chunk <- c(partial, chunk)
}
## Assure that lines of least 2 complete molecules are stored in chunk if available
inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
while(inner) {
chunklength <- length(chunk)
chunk <- c(chunk, readLines(f, n = n))
if(chunklength == length(chunk)) {
inner <- FALSE
} else {
inner <- sum(grepl("^\\${4,4}", chunk, perl=TRUE)) < 2
}
}
y <- regexpr("^\\${4,4}", chunk, perl=TRUE) # identifies all fields that start with a '$$$$' sign
index <- which(y!=-1)
indexDF <- data.frame(start=c(1, index[-length(index)]+1), end=index)
complete <- chunk[1:index[length(index)]]
if((index[length(index)]+1) <= length(chunk)) {
partial <- chunk[(index[length(index)]+1):length(chunk)]
} else {
partial <- NULL
}
index <- index + offset
indexDF <- data.frame(SDFlineStart=c(offset + 1, index[-length(index)]+1), SDFlineEnd=index)
offset <- indexDF[length(indexDF[,2]),2]
## Coerce file lines stored in character vector to SDFset
sdfset <- read.SDFset(read.SDFstr(complete))
valid <- validSDF(sdfset)
sdfset <- sdfset[valid]
indexDForig <- indexDF
indexDF <- indexDF[valid,]
## Perform desired computation on SDFset
if(length(indexDF[,1])==1) {
suppressWarnings(sdfset <- c(sdfset, sdfset)) # Trick to keep data in matrix format
resultMA <- fct(sdfset, ...)
resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA[1, , drop=FALSE]), row.names=row.names(resultMA)[1])
} else {
resultMA <- fct(sdfset, ...)
resultMA <- cbind(as.data.frame(indexDF), as.data.frame(resultMA), row.names=row.names(resultMA))
}
resultMA <- resultMA[names(valid),]
if(any(is.na(resultMA))) { # Maintains index for invalid compounds having NAs in descriptor fields
resultMA[,1:2] <- indexDForig[,1:2]
}
rownames(resultMA) <- paste("CMP", cmpid : (cmpid + length(resultMA[,1])-1), sep="")
cmpid <- cmpid + length(resultMA[,1])
## Print processing status to screen
if(silent==FALSE) {
print(rownames(resultMA))
}
## Append results to tabular file
if(counter==1 & append!=TRUE) {
unlink(output)
write.table(resultMA, output, quote=FALSE, col.names=NA, sep="\t")
} else {
write.table(resultMA, output, quote=FALSE, append=TRUE, col.names=FALSE, sep="\t")
}
}
if(length(chunk) == 0) {
stop <- TRUE
close(f)
}
}
}
## Usage:
# library(ChemmineR)
# data(sdfsample); sdfset <- sdfsample
# write.SDF(sdfset, "test.sdf")
## Choose descriptor set in a simple function:
# desc <- function(sdfset) {
# cbind(SDFID=sdfid(sdfset),
# # datablock2ma(datablocklist=datablock(sdfset)),
# MW=MW(sdfset),
# groups(sdfset),
# # AP=sdf2ap(sdfset, type="character"),
# rings(sdfset, type="count", upper=6, arom=TRUE)
# )
# }
# sdfStream(input="test.sdf", output="matrix.xls", fct=desc, Nlines=1000)
# indexDF <- read.delim("matrix.xls", row.names=1)[,1:2]
# sdfset <- read.SDFindex(file="test.sdf", index=indexDF, type="SDFset", outfile="sub.sdf")
#####################################################################
## (6.9) Read Atom Pair String Representation from File into APset ##
#####################################################################
## Function to convert atom pairs (AP) or atom pair fingerprints (APFP) stored
## as character strings to APset or FPset objects (e.g. generated by sdfStream).
## Alternatively, one can provide the AP/APFP strings in a named character vector.
read.AP <- function(x, type, colid,isFile = class(x)=="character" & length(x)==1) {
if(type=="ap") {
if(isFile) {
x <- read.delim(x, sep="\t", row.names=1)
ids <- row.names(x); x <- x[,colid]; names(x) <- ids
}
desclist <- strsplit(as.character(x), ", ", fixed = TRUE)
desclist <- lapply(desclist, as.numeric)
names(desclist) <- names(x)
return(as(desclist, "APset"))
}
if(type=="apfp" | type=="fp") {
if(isFile) {
x <- read.delim(x, colClasses="character", sep="\t", row.names=1)
ids <- row.names(x); x <- x[,colid]; names(x) <- ids
}
descma <- matrix(as.numeric(unlist(strsplit(x, ""))), length(x), nchar(x[1]), byrow=TRUE, dimnames=list(names(x), 1:nchar(x[1])))
#return(as(descma, "FPset"))
return(new( "FPset",fpma=descma,type="apfp"))
}
}
## Usage:
# apset <- read.AP(x="matrix.xls", type="ap", colid="AP")
# apfp <- read.AP(x="matrix.xls", type="apfp", colid="APFP")
#########################################################
## (6.10) Extract Molecules from SD File by Line Index ##
#########################################################
## Extracts specific molecules from SD File based on a line position index computed by the sdfStream function
read.SDFindex <- function(file, index, type="SDFset", outfile) {
f <- file(file, "r")
if(type=="SDFset") {
sdfset <- SDFset()
}
## Index transfromations
index <- index[order(index[,1]),] # Assures positional sorting, which is expected by the following step!!
index <- data.frame(skip=index[,1] - c(0, index[-length(index[,1]),2]) - 1 , nlines=index[,2] - index[,1] + 1)
## Scan through file using index to retrieve molecules one-by-one
for(i in seq(along=index[,1])) {
lines <- scan(f, skip=index[i,1], nlines=index[i,2], what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
#delteme# lines <- scan(file, skip=index[i,1]-1, nlines=index[i,2]-index[i,1] + 1, what="a", blank.lines.skip=FALSE, quiet=TRUE, sep ="\n")
if(type=="file") {
if(i == 1) {
unlink(outfile)
cat(lines, file=outfile, sep="\n")
} else {
cat(lines, file=outfile, sep="\n", append=TRUE)
}
}
if(type=="SDFset") {
suppressWarnings(sdfset <- c(sdfset, read.SDFset(read.SDFstr(lines))))
}
}
close(f)
if(type=="SDFset") {
cid(sdfset) <- paste("CMP", 1:length(sdfset), sep="")
return(sdfset)
}
}
## Usage: see sdfStream()
#################################
## (6.11) String Search Method ##
#################################
## String search function for SDFset
grepSDFset <- function(pattern, x, field="datablock", mode="subset", ignore.case=TRUE, ...) {
## Generate search vector and index for desired field in SDFset
if(field=="header" | field==1) {
searchfield <- header(x)
searchstr <- as.character(unlist(searchfield))
index <- sapply(searchfield, length)
index <- unlist(sapply(index, function(x) seq(1, x)))
indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
names(index) <- indexnames
}
if(field=="atomblock" | field==2) {
searchfield <- atomblock(x)
searchstr <- as.character(unlist(sapply(searchfield, rownames)))
searchstr <- gsub("_.*", "", searchstr)
index <- sapply(searchfield, function(x) length(x[,1]))
index <- unlist(sapply(index, function(x) seq(1, x)))
indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
names(index) <- indexnames
}
if(field=="bondblock" | field==3) { # Not very useful for string searching.
searchfield <- bondblock(x)
searchstr <- as.character(unlist(sapply(searchfield, rownames)))
searchstr <- gsub("_.*", "", searchstr)
index <- sapply(searchfield, function(x) length(x[,1]))
index <- unlist(sapply(index, function(x) seq(1, x)))
indexnames <- rep(seq(along=searchfield), sapply(searchfield, function(x) length(x[,1])))
names(index) <- indexnames
}
if(field=="datablock" | field==4) {
searchfield <- datablock(x)
searchstr <- paste(names(unlist(searchfield)), as.character(unlist(searchfield)), sep="___")
index <- sapply(searchfield, length)
index <- unlist(sapply(index, function(x) seq(1, x)))
indexnames <- rep(seq(along=searchfield), sapply(searchfield, length))
names(index) <- indexnames
}
## Search with grep
matchpos <- grep(pattern, searchstr, ignore.case=ignore.case, ...)
if(mode=="index") {
return(index[matchpos])
}
if(mode=="subset") {
xpos <- as.numeric(unique(names(index[matchpos])))
return(as(x[xpos], "SDF"))
}
}
##########################
## (7) Plotting Methods ##
##########################
## Plot single CMP Structure
plotStruc <- function(sdf,
atomcex=1.2,
atomnum=FALSE,
no_print_atoms=c("C"),
noHbonds=TRUE,
bondspacer=0.12,
colbonds=NULL,
bondcol="red",
regenCoords=FALSE,
...) {
if(regenCoords && .haveOB())
sdf = regenCoordsOB(sdf)
toplot <- list(atomblock=cbind(atomblock(sdf)[,c(1:2)], as.matrix(bonds(sdf, type="bonds")[,-1])), bondblock=cbind(as.matrix(as.data.frame(bondblock(sdf))[,1:3]), bondcol=1))
## Add bond color
toplot[[2]][, "bondcol"] <- toplot[[2]][,"bondcol"] + as.numeric((toplot[[2]][,"C1"] %in% colbonds) & (toplot[[2]][,"C2"] %in% colbonds))
## Create empty plot with proper dimensions
plot(toplot[[1]], type="n", axes=F, xlab="", ylab="", ...)
## Remove C-hydrogens including their bonds
if(noHbonds==TRUE) {
nonbonded <- !1:length(toplot[[1]][,1]) %in% sort(unique(as.vector(toplot[[2]][,1:2])))
nonbonded <- as.data.frame(toplot[[1]])[nonbonded,]
CHbondindex <- sapply(seq(toplot[[2]][,1]), function(x) paste(sort(gsub("_.*", "", rownames(toplot[[1]]))[toplot[[2]][x,1:2]]), collapse="") == "CH")
toplot[[1]] <- toplot[[1]][sort(unique(as.numeric(toplot[[2]][!CHbondindex,1:2]))), ]
toplot[[2]] <- as.matrix(as.data.frame(toplot[[2]])[!CHbondindex,])
toplot[[1]] <- as.matrix(rbind(toplot[[1]], nonbonded))
}
## Plot bonds
z <- toplot[[2]][, "bondcol"]; z[z==2] <- bondcol # Stores bond coloring data
for(i in seq(along=toplot[[2]][,1])) {
x <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],1]
y <- toplot[[1]][gsub("*.*_", "", rownames(toplot[[1]])) %in% toplot[[2]][i,1:2],2]
## Plot single bonds
if(toplot[[2]][i,3]==1) {
lines(x=x, y=y, lty=1, lwd=3, col=z[i])
}
## Plot double bonds
if(toplot[[2]][i,3]==2) {
rslope <- (atan(diff(y)/diff(x))*180/pi)/90
lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
}
## Plot triple bonds
if(toplot[[2]][i,3]==3) {
rslope <- (atan(diff(y)/diff(x))*180/pi)/90
bondspacer <- bondspacer * 2
lines(x=x, y=y, lty=1, lwd=3, col=z[i])
lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=1, lwd=3, col=z[i])
}
## Plot bonds with labels other than 1-3 (e.g. some ChEMBL SDFs use 4 for aromatic bonds)
if(!toplot[[2]][i,3] %in% c(1,2,3)) {
rslope <- (atan(diff(y)/diff(x))*180/pi)/90
lines(x=x-rslope*bondspacer, y=y+(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
lines(x=x+rslope*bondspacer, y=y-(1-abs(rslope))*bondspacer, lty=3, lwd=3, col=z[i])
}
}
## Exclude certain atoms from being printed
exclude <- paste("(^", no_print_atoms, "_)", sep="", collapse="|")
labelMA <- toplot[[1]][!grepl(exclude, rownames(toplot[[1]])), , drop=FALSE] # Added July 31, 2012: 'drop=FALSE'
## Add charges
charge <- c("0"="", "3"="3+", "2"="2+", "1"="+", "-1"="-", "-2"="2-", "-3"="3-")
charge <- charge[as.character(labelMA[,"charge"])]
## Add hydrogens to non-charged/non-C atoms according to valence rules (some SD files require this)
Nhydrogens <- c("0"="", "1"="H", "2"="H2", "3"="H3", "4"="H4", "5"="H5", "6"="H6", "7"="H7", "8"="H8")
hydrogens <- (labelMA[, "Nbondrule"] + labelMA[, "charge"]) - labelMA[,"Nbondcount"]
hydrogens[labelMA[,"charge"]!=0] <- 0; hydrogens[hydrogens < 0] <- 0
hydrogens <- Nhydrogens[as.character(hydrogens)]
## Plot data
if(is.vector(labelMA)) labelMA <- matrix(labelMA, 1, 2, byrow=TRUE, dimnames=list(rownames(toplot[[1]])[!grepl(exclude, rownames(toplot[[1]]))], c("C1", "C2")))
if(is.matrix(labelMA) & length(labelMA[,1])>=1) {
atomcol <- gsub("_.*", "", rownames(labelMA)); atomcol[!grepl("N|C|O|H", atomcol)] <- "any"; mycol <- c(C="black", H="black", N="blue", O="red", any="green"); atomcol <- mycol[atomcol]
## Overplot nodes to display atom labels
points(x=labelMA[,1], y=labelMA[,2], col="white", pch=16, cex=2.8)
## Plot atom labels
if(atomnum==TRUE) {
text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol)
} else {
text(x=labelMA[,1], y=labelMA[,2], paste(gsub("_.*", "", rownames(labelMA)), hydrogens, charge, sep=""), cex=atomcex, col=atomcol)
}
}
}
## Usage:
# plotStruc(sdf=sdfset[[2]], atomcex=1.2, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)
# par(mfrow=c(2,3)); for(i in 1:6) plotStruc(sdf=sdfset[[i]], atomcex=1.8, atomnum=F, no_print_atoms=c("C"), noHbonds=TRUE, bondspacer=0.08)
# disabled for now as rsvg is not available on bioC
openBabelPlot=function(sdfset,
height=600,
noHbonds=TRUE,
regenCoords=FALSE
){
.ensureOB()
tempF = tempfile()
sdf2image(sdfset,tempF,format="SVG",height=height,noHbonds,regenCoords,outputOptions=c("t"))
img = rsvg(tempF,height,height)
file.remove(tempF)
plot(c(0,100),c(0,100), type="n", axes=F, xlab="", ylab="")
rasterImage(img,0,0,100,100)
}
## Plot method for single SDF object
setMethod(f="plot", signature="SDF", definition=function(x, print=TRUE, ...) {
plotStruc(sdf=x, ...)
if(print==TRUE) { return(x) }
}
)
## Plot method for multiple SDF objects in SDFset
setMethod(f="plot", signature="SDFset",
definition=function(x, griddim, print_cid=cid(x), print=TRUE, ...) {
if(missing(griddim)) {
mydim <- ceiling(sqrt(length(x)))
griddim <- c(mydim, mydim)
}
par(mfrow=griddim)
for(i in 1:length(x)) { plotStruc(sdf=x[[i]], main=print_cid[i], ...) }
if(print==TRUE) { return(SDFset2SDF(x)) }
})
## Object for referencing chemminetools jobs
setClass("jobToken", representation=representation(
tool_name = "character",
jobId = "character"
))
## Show method for checking status of a submitted chemminetools job
setMethod("show", signature=signature(
object="jobToken"),
function(object){
response <- status(object)
cat("tool name:\t", slot(object, "tool_name"), "\n")
cat("status:\t\t", response, "\n")
}
)
cstrsplit <- function(line) .Call(cstrsplitSym,line)
#cstrsplit <- cxxfunction(signature(l="character"),includes='
# #include <R.h>
# #include <boost/algorithm/string.hpp>
# ',body='
# std::vector<std::string> strs;
# const char *line = CHAR(STRING_ELT(l,0));
# boost::split(strs, line, boost::is_any_of("\t "),boost::token_compress_on);
#
# he
# CharacterVector output2(strs.begin(),strs.end());
# return output2;
#
# ',plugin="Rcpp")
SDFDataTable <- function(sdfset) {
data = cbind( lapply(seq(along=sdfset),function(i){
tempF = tempfile()
sdf2image(sdfset[i],tempF,format="SVG",height=200,TRUE,TRUE,outputOptions = c('d'))
encoding = base64encode(tempF)
unlink(tempF)
encoding
} ), datablock2ma(datablock(sdfset)))
colnames(data)[1] = "Image"
datatable(data,
extensions= c('Buttons','FixedColumns','FixedHeader'),
options = list(
dom="Bfrtipl",
buttons = c("colvis","csv"),
scrollX=TRUE,
fixedColumns= list(leftColumns=2),
#fixedHeader = TRUE,
pageLength = 5,
columnDefs = list(list(
targets = 1,
data = "image",
render = JS(
"function(data,type,row,meta){",
" return \"<img = height=100 width=100 src='data:image/svg+xml;base64,\"+row[1]+\"'>\";",
"}")
))))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.