### THIS LIB is COPIED FROM http://labs.genetics.ucla.edu/horvath/RFclustering/RFclustering.htm
###############
# LIBRARIES #
###############
library(MASS)
library(cluster)
library(survival)
library(randomForest)
library(Hmisc)
#install.path <- dirname(sys.frame(1)$ofile)
#########################################################################################
#########################################################################################
if (exists("Rand") ) rm(Rand)
Rand <- function(tab,adjust=T) {
##########################################################################
# The function computes the (adjusted) Rand index between two partitions #
# Copyright Steve Horvath and Luohua Jiang, UCLA, 2003 #
##########################################################################
# helper function
choosenew <- function(n,k) {
n <- c(n); out1 <- rep(0,length(n));
for (i in c(1:length(n)) ){
if ( n[i]<k ) {out1[i] <- 0}
else {out1[i] <- choose(n[i],k) }
}
out1
}
a <- 0; b <- 0; c <- 0; d <- 0; nn <- 0
n <- nrow(tab)
for (i in 1:n) {
for(j in 1:n) {
a <- a+choosenew(tab[i,j],2)
nj <- sum(tab[,j])
c <- c+choosenew(nj,2)
}
ni <- sum(tab[i,])
b <- b+choosenew(ni,2)
nn <- nn+ni
}
if(adjust==T) {
d <- choosenew(nn,2)
adrand <- (a-(b*c/n)/d)/(0.5*(b+c/n)-(b*c/n)/d)
adrand
} else {
b <- b-a
c <- c/n-a
d <- choosenew(nn,2)-a-b-c
rand <- (a+d)/(a+b+c+d)
rand
}
}
#########################################################################################
#########################################################################################
if (exists("pamNew") ) rm(pamNew)
pamNew <- function (x, k, diss1 = inherits(x, "dist"), metric1 = "euclidean")
{
#############################################################################################################
# A new pam clustering function which corrects the clustering membership based on the sillhouette strength. #
# The clustering membership of an observation with a negative sillhouette strength is reassigned to its #
# neighboring cluster. #
# The inputs of the function are similar to the original 'pam' function. #
# The function returns a vector of clustering labels. #
# Copyright 2003 Tao Shi and Steve Horvath (last modified 10/31/03) #
#############################################################################################################
if (diss1)
{
if (!is.null(attr(x, "Labels"))) { original.row.names <- attr(x, "Labels")}
names(x) <- as.character(c(1:attr(x, "Size")))
}
else
{
if(!is.null(dimnames(x)[[1]])) { original.row.names <- dimnames(x)[[1]]}
row.names(x) <- as.character(c(1:dim(x)[[1]]))
}
pam1 <- pam(x,k,diss=diss1, metric=metric1)
label2 <- pam1$clustering
silinfo1 <- pam1$silinfo$widths
index1 <- as.numeric(as.character(row.names(silinfo1)))
silinfo2 <- silinfo1[order(index1),]
labelnew <- ifelse(silinfo2[,3]<0, silinfo2[,2], silinfo2[,1])
names(labelnew) <- original.row.names
labelnew
}
###############################################################################################
###############################################################################################
if (exists("collect.garbage") ) rm(collect.garbage)
collect.garbage <- function(){
## The following function collects garbage until the memory is clean.
## Usage: 1. immediately call this function after you call a function or
## 2. rm()
while (gc()[2,4] != gc()[2,4]){}
}
###############################################################################################
###############################################################################################
if (exists("RFdist") ) rm(RFdist)
RFdist <- function(Rf.data, datRF, imp=T, no.tree, proxConver=F) {
####################################################################
# Unsupervised randomForest function #
# Return a list "distRF" containing some of the following 6 fields #
# depending on the options specified: #
# (1) cl1: addcl1 distance (sqrt(1-RF.proxAddcl1)) #
# (2) err1: error rate #
# (3) imp1: variable importance for addcl1 #
# (4) prox1Conver: a matrix containing two convergence meausres #
# for addcl1 proximity #
# a). max( abs( c(aveprox(N))-c(aveprox(N-1)))) #
# b). mean((c(aveprox(N))-c(aveprox(N-1)))^2) #
# where N is number of forests (no.rep). #
# (5) cl2, (6) err2, (7)imp2 and (8) prox2Conver for addcl2 #
# Copyright Steve Horvath and Tao Shi (2004) #
####################################################################
cleandist <- function(x) {
x1 <- as.dist(x)
x1[x1<=0] <- 0.0000000001
as.matrix(x1)
}
no.rep <- length(Rf.data)
nrow1 <- dim(datRF)[[1]]
ncol1 <- dim(datRF)[[2]]
RFproxAddcl1 <- matrix(0,nrow=nrow1,ncol=nrow1)
RFprox1Conver <- cbind(1:no.rep,matrix(0,(no.rep),3))
RFimportance1 <- matrix(0, nrow=ncol1, ncol=4)
RFerrrate1 <- 0
rep1 <- rep(666,2*nrow1)
for (i in 1:length(Rf.data) ) {
i = i -1
index1 <- Rf.data[[i+1]]$index1
yy <- Rf.data[[i+1]]$yy
importance <- Rf.data[[i+1]]$importance
err.rate <- Rf.data[[i+1]]$err.rate
RF1prox <- Rf.data[[i+1]]$RF1prox
if (i > 0) {
if (i > 1){
xx <- ((RFproxAddcl1 + (RF1prox[c(1:nrow1),c(1:nrow1)]))/i) - (RFproxAddcl1/(i-1))
yy <- mean( c(as.dist((RFproxAddcl1 + (RF1prox[c(1:nrow1),c(1:nrow1)]))/i)))
RFprox1Conver[i,2] <- max(abs(c(as.dist(xx))))
RFprox1Conver[i,3] <- mean((c(as.dist(xx)))^2)
RFprox1Conver[i,4] <- yy
}
RFproxAddcl1 <- RFproxAddcl1 + (RF1prox[c(1:nrow1),c(1:nrow1)])
if(imp) { RFimportance1 <- RFimportance1+ 1/no.rep*(importance) }
RFerrrate1 <- RFerrrate1+ 1/no.rep*(err.rate[no.tree])
}
}
distRFAddcl1 <- cleandist(sqrt(1-RFproxAddcl1/no.rep))
distRF <- list(cl1=NULL, err1=NULL, imp1=NULL, prox1Conver=NULL,
cl2=NULL, err2=NULL, imp2=NULL, prox2Conver=NULL)
distRF$cl1 <- distRFAddcl1
distRF$err1 <- RFerrrate1
if(imp) distRF$imp1 <- RFimportance1
if(proxConver) distRF$prox1Conver <- RFprox1Conver
distRF
}
set_lock <- function ( filename ) {
system ( paste('touch ',filename,'.lock', sep='') )
}
release_lock <- function ( filename ) {
system ( paste('rm ',filename,'.lock', sep='') )
}
locked <- function ( filename ) {
ret = TRUE
if (file.exists( filename )){
ret <- file.exists( paste(filename,'.lock', sep='') )
}
ret
}
read_RF <- function ( files=c(''), max.wait = 20 ) {
returnRF <- NULL
waited = 0
Sys.sleep(20) ## sleep 20 sec
read <- 0
while ( read < length(files) ){
if (waited /3 >= max.wait ) {
stop ( "Sorry the other processes did not finish in time" )
}
if (locked( files[1]) ) {
print ( paste ( "wating for files to unlock! ( n =",waited,")"))
}
else {
for ( i in 1:length(files) ) {
if ( ! locked( files[i]) ) {
if ( i == 1 ){
load(files[i])
returnRF <- Rf.data
read = read +1
}
else {
load(files[i])
a <- 1 + length(returnRF)
for ( z in 1:length(Rf.data)){
returnRF[[a]] <- Rf.data[[z]]
a = a +1
}
read = read +1
}
}
}
}
}
returnRF
}
save_RF <- function ( Rf.data , fname ) {
save( Rf.data, file=fname )
}
calculate_RF <- function (datRF = NULL, mtry1=3, no.rep= 20, no.tree= 500, addcl1=TRUE, addcl2=FALSE, imp=T, oob.prox1=T, max.syn=100, file) {
synthetic1 <- function(dat, syn.n=NULL) {
sample1 <- function(X) { sample(X, replace=T) }
g1 <- function(dat) { apply(dat,2,sample1) }
nrow1 <- dim(dat)[[1]]
yy <- rep(c(1,2),c(nrow1,nrow1) )
data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)))))
}
synthetic1.1 <- function(dat) {
sample1 <- function(X) { sample(X, replace=T) }
g1 <- function(dat) { apply(dat,2,sample1) }
nrow1 <- dim(dat)[[1]]
syn.n <- nrow1
if ( syn.n > 50 ) {
syn.n = 50
}
yy <- rep(c(1,2),c(nrow1,syn.n) )
data.frame(cbind(yy,rbind(dat,data.frame(g1(dat)[1:syn.n,]))))
}
synthetic2 <- function(dat) {
sample2 <- function(X) { runif(length(X), min=min(X), max =max(X)) }
g2 <- function(dat) { apply(dat,2,sample2) }
nrow1 <- dim(dat)[[1]];
yy <- rep(c(1,2),c(nrow1,nrow1) );
data.frame(cbind(yy,rbind(dat,data.frame(g2(dat)))))
}
cleandist <- function(x) {
x1 <- as.dist(x)
x1[x1<=0] <- 0.0000000001
as.matrix(x1)
}
Rf.data <- vector('list', no.rep +1)
syn.n <- nrow1 <- dim(datRF)[[1]]
if ( syn.n > max.syn ) {
syn.n = max.syn
}
ncol1 <- dim(datRF)[[2]]
rep1 <- rep(-1,nrow1+syn.n)
if ( addcl1 && addcl2 ){
stop( "Sorry you can not get an addc1 AND add2 distribution on the same time!")
}
if (addcl1) {
for (i in c(0:no.rep)) {
print ( paste( "forest", i) )
index1 <- sample(c(1:(nrow1+syn.n)))
rep1[index1] <- c(1:(nrow1+syn.n))
datRFsyn <- synthetic1(datRF,syn.n)[index1,]
yy <- datRFsyn[,1]
RF1 <- randomForest(factor(yy)~.,data=datRFsyn[,-1], ntree=no.tree, oob.prox=oob.prox1, proximity=TRUE,do.trace=F,mtry=mtry1,importance=imp)
collect.garbage()
RF1prox <- RF1$proximity[rep1,rep1]
Rf.data = list( A = list(index1=index1, yy=yy, RF1prox=RF1prox, importance =RF1$importance, err.rate=RF1$err.rate ) )
save(Rf.data, file=paste(file,i,".Rdata", sep='' ) )
}
}
if (addcl2) {
for (i in c(0:no.rep)) {
print ( paste( "forest", i) )
index1 <- sample(c(1:(2*nrow1)))
rep1[index1] <- c(1:(2*nrow1))
datRFsyn <- synthetic2(datRF)[index1,]
yy <- datRFsyn[,1]
RF1 <- randomForest(factor(yy)~.,data=datRFsyn[,-1], ntree=no.tree, oob.prox=oob.prox1, proximity=TRUE,do.trace=F,mtry=mtry1,importance=imp)
collect.garbage()
RF1prox <- RF1$proximity[rep1,rep1]
Rf.data = list( A = list(index1=index1, yy=yy, RF1prox=RF1prox, importance =RF1$importance, err.rate=RF1$err.rate ) )
save(Rf.data, file=paste(file,i,".Rdata", sep='' ) )
}
}
Rf.data
}
prepareRFcalc <- function( x, trees=433, forests=7, pocs=32, opath, debug=F ) {
UseMethod('prepareRFcalc', x)
}
prepareRFcalc.StefansExpressionSet <- function( x, trees=433, forrests=7, pocs=32, opath, debug=F ) {
print ( install.path )
datRF <- x$data
save( datRF, file=paste(opath,'datRF.RData',sep="/" ) )
if ( debug) {
debug = ' -debug '
}
else {
debug =''
}
system ( paste( "perl ", install.path,"/../scripts/CreateRandomForestScripts.pl -infile ",
paste(opath,'datRF.RData',sep="/" ), " -splits ",pocs, " -trees ", trees,
" -forests ",forests , " -outfile ", paste(opath,"RandomForestTransfereData.tar.gz", sep='/'), sep='' ) )
paste( "Please copy the file",paste(opath,"RandomForestTransfereData.tar.gz", debug, sep='/'),
"to the calculation server and run the script 'submit_RF_to_qsub.pl -files randomForest_worker_*.R'")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.