####
## Adapted version of https://github.com/cran/fpc
####
clujaccard <- function(c1,c2,zerobyzero=NA){
# print(c1)
# print(c2)
if (sum(c1)+sum(c2)-sum(c1 & c2)==0) out <- zerobyzero
else
out <- sum(c1 & c2)/(sum(c1)+sum(c2)-sum(c1 & c2))
out
}
noisemclustCBI <- function(data, G=NULL, k=NULL, modelNames=NULL,
nnk=0, hcmodel = NULL,
Vinv = NULL, summary.out=FALSE, ...){
# require(mclust)
# require(prabclus)
if (!is.null(k)) G <- k
data <- as.matrix(data)
# print(str(data))
if (nnk > 0) {
noise <- as.logical(1 - NNclean(data, nnk)$z)
# print(noise)
if (!is.null(hcmodel))
hcPairs <- hc(modelName = hcmodel, data = data)
if (is.null(Vinv) & is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),...)
if (!is.null(Vinv) & is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),
Vinv=Vinv,...)
if (is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),...)
if (!is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),
Vinv=Vinv,...)
}
else {
if (!is.null(hcmodel)) {
hcPairs <- hc(modelName = hcmodel, data = data)
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs),
Vinv=Vinv,...)
}
else
c1 <- mclustBIC(data, G, modelNames,
Vinv=Vinv,...)
noise <- rep(0, nrow(data))
}
# print(c1)
sc1 <- summary(c1,data)
# print(str(sc1))
sc1c <- sc1$classification
cl <- list()
if (sc1$G==0)
sc1c <- rep(0,nrow(data))
nc <- nccl <- max(sc1c)
if (sum(sc1c==0)>0){
nc <- nccl+1
sc1c[sc1c==0] <- nc
}
for (i in 1:nc)
cl[[i]] <- sc1c == i
if (summary.out)
out <- list(result = c1, nc = nc, nccl = nccl, clusterlist = cl,
partition = sc1c, nnk = nnk, initnoise = as.logical(noise),
mclustsummary=sc1,
clustermethod = "mclustBIC")
else
out <- list(result = c1, nc = nc, nccl = nccl, clusterlist = cl,
partition = sc1c, nnk = nnk, initnoise = as.logical(noise),
clustermethod = "mclustBIC")
out
}
distnoisemclustCBI <- function(dmatrix, G=NULL, k=NULL, modelNames=NULL,
nnk=0, hcmodel = NULL,
Vinv = NULL, mdsmethod="classical",
mdsdim=4, summary.out=FALSE,
points.out=FALSE, ...){
dmatrix <- as.matrix(dmatrix)
if (!is.null(k)) G <- k
n <- ncol(dmatrix)
# require(MASS)
# require(prabclus)
# require(mclust)
if (mdsmethod != "classical") {
mindm <- min(dmatrix[dmatrix > 0])/10
dmatrix[dmatrix<mindm] <- mindm
}
data <- switch(mdsmethod, classical = cmdscale(dmatrix, k = mdsdim),
kruskal = isoMDS(dmatrix, k = mdsdim)$points, sammon =
sammon(dmatrix, k = mdsdim)$points)
data <- as.matrix(data)
if (nnk > 0) {
noise <- as.logical(1 - NNclean(data, nnk)$z)
# print(noise)
if (!is.null(hcmodel))
hcPairs <- hc(modelName = hcmodel, data = data)
if (is.null(Vinv) & is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),...)
if (!is.null(Vinv) & is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),
Vinv=Vinv,...)
if (is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),...)
if (!is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),
Vinv=Vinv,...)
}
else {
if (!is.null(hcmodel)) {
hcPairs <- hc(modelName = hcmodel, data = data)
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs),
Vinv=Vinv,...)
}
else
c1 <- mclustBIC(data, G, modelNames,
Vinv=Vinv,...)
noise <- rep(0, nrow(data))
}
# print(c1)
sc1 <- summary(c1,data)
# print(str(sc1))
sc1c <- sc1$classification
cl <- list()
if (sc1$G==0)
sc1c <- rep(0,nrow(data))
nc <- nccl <- max(sc1c)
if (sum(sc1c==0)>0){
nc <- nccl+1
sc1c[sc1c==0] <- nc
}
for (i in 1:nc)
cl[[i]] <- sc1c == i
out <- list(result = c1, nc = nc, nccl = nccl, clusterlist = cl,
partition = sc1c, nnk = nnk, initnoise = as.logical(noise),
clustermethod = "mclustBIC")
if (summary.out)
out$mclustsummary <- sc1
if (points.out)
out$points <- data
out
}
mergenormCBI <- function(data, G=NULL, k=NULL, modelNames=NULL, nnk=0,
hcmodel = NULL,
Vinv = NULL, mergemethod="bhat",
cutoff=0.1,...){
# require(mclust)
# require(prabclus)
if (!is.null(k)) G <- k
if (nnk > 0) {
noise <- as.logical(1 - NNclean(data, nnk)$z)
# print(noise)
if (!is.null(hcmodel))
hcPairs <- hc(modelName = hcmodel, data = data)
if (is.null(Vinv) & is.null(hcmodel)){
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),...)
# print("mclust done")
}
if (!is.null(Vinv) & is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames, initialization=list(noise=noise),
Vinv=Vinv,...)
if (is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),...)
if (!is.null(Vinv) & !is.null(hcmodel))
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs, noise=noise),
Vinv=Vinv,...)
}
else {
if (!is.null(hcmodel)) {
hcPairs <- hc(modelName = hcmodel, data = data)
c1 <- mclustBIC(data, G, modelNames,
initialization=list(hcPairs=hcPairs),
Vinv=Vinv,...)
}
else
c1 <- mclustBIC(data, G, modelNames,
Vinv=Vinv,...)
noise <- rep(0, nrow(data))
}
sc1 <- summary(c1,data)
# print("sum")
# print(max(sc1$classification))
jsc1 <- mergenormals(data,sc1,method=mergemethod,cutoff=cutoff,...)
# print(jsc1$clusternumbers)
# print("merge")
renumcl <- jsc1$clustering
# cln <- 1
# for (i in 1:max(jsc1$clustering)){
# if(sum(jsc1$clustering==i)>0){
# renumcl[jsc1$clustering==i] <- cln
# cln <- cln+1
# }
# }
cl <- list()
nc <- nccl <- max(jsc1$clustering)
if (sum(jsc1$clustering==0)>0){
nc <- nc+1
renumcl[jsc1$clustering==0] <- nc
}
for (i in 1:nc)
cl[[i]] <- renumcl == i
# if (nc>nccl)
# cl[[nc]] <- renumcl == 0
out <- list(result = jsc1, nc = nc, nccl = nccl, clusterlist = cl,
partition = renumcl, nnk = nnk, initnoise = as.logical(noise),
clustermethod = "mclust/mergenormals")
out
}
hclustCBI <- function(data,k,cut="number",method,scaling=TRUE,noisecut=0,...){
if(!identical(scaling,FALSE))
sdata <- scale(data,center=TRUE,scale=scaling)
else
sdata <- data
n <- nrow(data)
noise <- FALSE
c1 <- hclust(dist(sdata),method=method)
if (cut=="number")
partition <- cutree(c1,k=k)
else
partition <- cutree(c1,h=k)
cl <- list()
nc <- max(partition)
clsizes <- numeric(0)
for (i in 1:nc) clsizes[i] <- sum(partition==i)
ncn <- sum(clsizes>noisecut)
if (ncn<nc){
noise <- TRUE
newcln <- (1:nc)[clsizes>noisecut]
nc <- ncn+1
newpart <- rep(nc,n)
for (i in 1:ncn)
newpart[partition==newcln[i]] <- i
partition <- newpart
}
# print(nc)
# print(sc1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,noise=noise,
nc=nc,clusterlist=cl,partition=partition,
clustermethod="hclust/cutree")
out
}
hclusttreeCBI <- function(data,minlevel=2,method,scaling=TRUE,...){
if(!identical(scaling,FALSE))
sdata <- scale(data,center=TRUE,scale=scaling)
else
sdata <- data
c1 <- hclust(dist(sdata),method=method)
n <- nrow(data)
clist <- list()
for (i in 1:n){
clist[[i]] <- rep(FALSE,n)
clist[[i]][i] <- TRUE
}
clcount <- n
for (j in 1:(n-2)){
clcount <- clcount+1
if (c1$merge[j,1]<0) clist1 <- clist[[-c1$merge[j,1]]]
else clist1 <- clist[[n+c1$merge[j,1]]]
if (c1$merge[j,2]<0) clist2 <- clist[[-c1$merge[j,2]]]
else clist2 <- clist[[n+c1$merge[j,2]]]
clist[[clcount]] <- clist2 | clist1
}
clusterlist <- list()
if (minlevel==1){
clusterlist <- clist
nc <- clcount
}
else{
for (j in (n+minlevel-1):clcount)
clusterlist[[j-minlevel-n+2]] <- clist[[j]]
nc <- clcount-minlevel-n+2
}
# print(nc)
# print(sc1)
out <- list(result=c1,nc=nc,clusterlist=clusterlist,partition=cutree(c1,2),
clustermethod="hclust, full tree")
out
}
# for clara & pam, data called x
claraCBI <- function(data,k,usepam=TRUE,diss=inherits(data,"dist"),...){
if (usepam)
c1 <- pam(data,k=k,diss=diss,...)
else
c1 <- clara(data,k=k,...)
partition <- c1$clustering
cl <- list()
nc <- k
# print(nc)
# print(sc1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
clustermethod="clara/pam")
out
}
speccCBI <- function(data,k,...){
# require(kernlab)
data <- as.matrix(data)
options(show.error.messages = FALSE)
c1 <- try(specc(data,centers=k,...))
options(show.error.messages = TRUE)
if (inherits(c1,"try-error")){
partition <- rep(1,nrow(data))
cat("Function specc returned an error, probably a one-point cluster.\n All observations were classified to cluster 1.\n")
}
else
partition <- c1@.Data
cl <- list()
nc <- k
# print(nc)
# print(sc1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
clustermethod="spectral")
out
}
# tclustCBI <- function(data,k,trim=0.05,...){
# if (requireNamespace("tclust", quietly = TRUE)) {
# data <- as.matrix(data)
# c1 <- tclust::tclust(data,k=k,alpha=trim,...)
# sc1c <- c1$cluster
# cl <- list()
# nc <- nccl <- max(sc1c)
# if (sum(sc1c==0)>0){
# nc <- nccl+1
# sc1c[sc1c==0] <- nc
# }
# for (i in 1:nc)
# cl[[i]] <- sc1c == i
# out <- list(result=c1,nc=nc,nccl=nccl,clusterlist=cl,partition=sc1c,
# clustermethod="tclust")
# out
# }
# else
# warning("tclust could not be loaded")
# }
# trimkmeansCBI <- function(data,k,scaling=TRUE,trim=0.1,...){
# c1 <- trimkmeans(data,k=k,scaling=scaling,trim=trim,...)
# partition <- c1$classification
# cl <- list()
# nc <- k+1
# nccl <- k
# # print(nc)
# # print(sc1)
# for (i in 1:nc)
# cl[[i]] <- partition==i
# out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
# nccl=nccl,clustermethod="trimkmeans")
# out
# }
kmeansruns <- function(data,krange=2:10,criterion="ch",
iter.max=100,runs=100,
scaledata=FALSE,alpha=0.001,
critout=FALSE,plot=FALSE,...){
data <- as.matrix(data)
if (scaledata) data <- scale(data)
if (criterion=="asw") sdata <- dist(data)
cluster1 <- 1 %in% krange
crit <- numeric(max(krange))
km <- list()
for (k in krange){
if (k>1){
minSS <- Inf
kmopt <- NULL
for (i in 1:runs){
options(show.error.messages = FALSE)
#numUnique = length(unique(data))
#print(paste0("Num unique: ", numUnique))
#print(paste0("Num NOT unique: ", length(data)))
#repeat{
# cat(k," ",i,"before \n")
kmm <- try(kmeans(data,k,iter.max=iter.max,...))
# str(kmm)
if (inherits(kmm,"try-error")) {
stop(paste0("Cannot compute for k = ", k))
}
#}
options(show.error.messages = TRUE)
swss <- sum(kmm$withinss)
# print(calinhara(data,kmm$cluster))
if (swss<minSS){
kmopt <- kmm
minSS <- swss
}
if (plot){
par(ask=TRUE)
pairs(data,col=kmm$cluster,main=swss)
}
} # for i
km[[k]] <- kmopt
# print(km[[k]])
# print(calinhara(data,km[[k]]$cluster))
crit[k] <- switch(criterion,
asw=cluster.stats(sdata,km[[k]]$cluster)$avg.silwidth,
ch=calinhara(data,km[[k]]$cluster))
if (critout)
cat(k," clusters ",crit[k],"\n")
} # if k>1
} # for k
if (cluster1)
cluster1 <- dudahart2(data,km[[2]]$cluster,alpha=alpha)$cluster1
k.best <- which.max(crit)
if (cluster1)
k.best <- 1
# print(crit)
# print(k.best)
# print(km[[k.best]])
km[[k.best]]$crit <- crit
km[[k.best]]$bestk <- k.best
out <- km[[k.best]]
out
}
kmeansCBI <- function(data,krange,k=NULL,scaling=FALSE,runs=1,criterion="ch",...){
if (!is.null(k)) krange <- k
#print(paste0("k is ", krange))
if(!identical(scaling,FALSE))
sdata <- scale(data,center=TRUE,scale=scaling)
else
sdata <- data
c1 <- kmeansruns(sdata,krange,runs=runs,criterion=criterion,...)
partition <- c1$cluster
cl <- list()
nc <- c1$bestk
# print("---")
# print(nc)
# print(c1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
clustermethod="kmeans")
out
}
rskcCBI <- function(data,krange,k=NULL,scaling=FALSE,alpha=0,L1=NULL,correlation=FALSE,
silent=TRUE,nstart = 200, ...) {
if (ncol(data) <= 1) {
stop("Cannot perform RSKC clustering. Input dataframe must at least provide two columns")
}
if (!is.null(k)) krange <- k
if (is.null(L1)) { # Compute automatic best L1 boundry
#print(paste0("No L1 provided. Computing best L1 boundry with 'sparcl::KMeansSparseCluster.permute'"))
wbounds = seq(2,sqrt(ncol(data)), len=30)
km.perm <- sparcl::KMeansSparseCluster.permute(data,K=krange,wbounds=wbounds,nperms=5,silent=TRUE)
L1 = km.perm$bestw
#print(paste0("Best L1 upper bound is: ", L1))
} # Else whatever the user want
c1 = RSKC(data, krange, alpha, L1, nstart = 200,
silent=silent, scaling=scaling,correlation=correlation)
partition <- c1$labels
cl <- list()
nc <- c1$ncl
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
clustermethod="rskc")
out
}
pamkCBI <- function (data, krange = 2:10,k=NULL,
criterion="asw", usepam=TRUE,
scaling = FALSE, diss = inherits(data,"dist"), ...)
{
# require(cluster)
if (!is.null(k)) krange <- k
c1 <- pamk(data, krange = krange, criterion=criterion, usepam=usepam,
scaling = scaling, diss = diss,
...)
partition <- c1$pamobject$clustering
cl <- list()
nc <- c1$nc
# print(nc)
for (i in 1:nc) cl[[i]] <- partition == i
out <- list(result = c1, nc = nc, clusterlist = cl, partition = partition,
clustermethod = "pam/estimated k",criterion=criterion)
out
}
# for dbscan, data called x
dbscanCBI <- function(data,eps,MinPts,diss=inherits(data,"dist"),...){
if (diss)
c1 <- dbscan(data,eps,MinPts,method="dist",...)
else
c1 <- dbscan(data,eps,MinPts,...)
# plot(c1, data)
partition <- c1$cluster
cl <- list()
nccl <- max(partition)
partition[partition==0] <- nccl+1
nc <- max(partition)
# print(nc)
# print(sc1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,nc=nc,nccl=nccl,clusterlist=cl,partition=partition,
clustermethod="dbscan")
out
}
mahalCBI <- function(data,clustercut=0.5,...){
c1 <- fixmahal(data,...)
partition <- rep(1,nrow(data))
cl <- fpclusters(c1)
nc <- length(cl)
if (nc>0){
for (i in 1:nc)
cl[[i]] <- as.integer(cl[[i]]>clustercut)
}
else{
nc <- 1
cl[[1]] <- as.integer(c1$g[[1]]>clustercut)
}
out <- list(result=c1,nc=nc,clusterlist=cl,partition=partition,
clustermethod="fixmahal")
out
}
# pdfclustCBI <- function(data,...){
# if (requireNamespace("pdfCluster", quietly = TRUE)) {
# c1 <- pdfCluster::pdfCluster(data,...)
# sc1c <- c1@clusters
# cl <- list()
# nc <- nccl <- max(sc1c)
# for (i in 1:nc)
# cl[[i]] <- sc1c == i
# out <- list(result=c1,nc=nc,nccl=nccl,clusterlist=cl,partition=sc1c,
# clustermethod="pdfCluster")
# out
# }
# else
# warning("pdfCluster could not be loaded")
# }
# distr could also be mvt, msn, mst
# emskewCBI <- function(data,k,distr="mst",repeats=100,...){
# if (requireNamespace("EMMIXskew", quietly = TRUE)) {
# if (!exists("distr")) distr="mvn"
# attempt <- 1
# repeat{
# c1 <- EMMIXskew::EmSkew(data,g=k,distr=distr,...)
# if (!is.null(c1)){
# if (max(c1$clust)==k)
# break
# }
# attempt <- attempt+1
# if(attempt>repeats){
# cat("EmSkew failed after ",repeats," repetitions.")
# break
# }
# if(debug) print("Repeat EmSkew exection")
# }
# partition <- c1$clust
# nc <- max(partition)
# cl <- list()
# for (i in 1:nc) cl[[i]] <- partition == i
# out <- list(result = c1, nc = nc, clusterlist = cl, partition = partition,
# clustermethod = paste("emskew_",distr,sep=""))
# out
# }
# else
# warning("EMMIXskew could not be loaded")
# }
compare_with_gold_standard <- function(data, gold_standard, clustermethod, datatomatrix, ...) {
if (datatomatrix) {
data <- as.matrix(data)
}
c1 <- clustermethod(data, ...)
randresult <- fossil::rand.index(gold_standard, c1$partition)
#message("Gold standard")
#message(gold_standard)
#message("Partitions")
#message(c1$partition)
#message("Rand Index: ")
#message(randresult)
out <- list(result = c1, partition = c1$partition,
nc = c1$nc, nccl = c1$nccl,
clustermethod = c1$clustermethod,
bootmean = randresult)
class(out) <- "no_bootstrap"
return(out)
}
clusterboot <- function(data,B=100,
distances=(inherits(data,"dist")),
bootmethod="boot",
bscompare=TRUE, multipleboot=FALSE,
jittertuning=0.05, noisetuning=c(0.05,4),
subtuning=floor(nrow(data)/2),
clustermethod,noisemethod=FALSE,
count=TRUE,
showplots=FALSE,dissolution=0.5,
recover=0.75,seed=NULL,datatomatrix=TRUE,
gold_standard=NULL,...){
sumlogic <- function(x,y,relation="eq")
switch (relation,
eq = sum(x==y, na.rm=TRUE),
s = sum(x<y, na.rm=TRUE),
l = sum(x>y, na.rm=TRUE),
se = sum(x<=y, na.rm=TRUE),
le = sum(x>=y, na.rm=TRUE))
if (!is.null(seed)) {
set.seed(seed)
}
# Avoid bootstrap if gold standard classification is set
if (!is.null(gold_standard)) {
return(compare_with_gold_standard(data = data, gold_standard = gold_standard,
clustermethod = clustermethod, datatomatrix=datatomatrix, ...))
}
invisible(distances)
if (datatomatrix)
data <- as.matrix(data)
if (distances & showplots & datatomatrix) dpoints <- cmdscale(data)
n <- nrow(data)
p <- ncol(data)
if (datatomatrix & !distances){
cod <- cov(data)
md <- colMeans(data)
}
lb <- length(bootmethod)
if (distances) {
# checkIfCanCluster(data=data, ...)
c1 <- clustermethod(as.dist(data),...)
} else {
# checkIfCanCluster(data=data, ...)
c1 <- clustermethod(data,...)
}
# print(noisemethod)
# print(str(c1))
if (noisemethod){
if (c1$nccl==0)
stop("No clusters, only noise estimated!")
}
else
c1$nccl <- c1$nc
bootresult <- jitterresult <- noiseresult <-
bojitresult <- subsetresult <- matrix(0,nrow=c1$nc,ncol=B)
if (("jitter" %in% bootmethod) | ("bojit" %in% bootmethod)){
if (!datatomatrix | distances) stop("datatomatrix=FALSE and distances require boot or subset as bootmethod.")
jsd <- numeric(0)
ecd <- eigen(cod, symmetric=TRUE)
ecd$values[ecd$values<0] <- 0
ecd$values[is.na(ecd$values)] <- 0
rotdata <- data %*% solve(t(ecd$vectors))
for (i in 1:p){
sx <- sort(rotdata[,i])
dx <- sx[2:n]-sx[1:(n-1)]
dx <- dx[dx>0]
jsd[i] <- quantile(dx,jittertuning)
}
}
if ("noise" %in% bootmethod){
if (!datatomatrix | distances) stop("datatomatrix=FALSE and distances require boot or subset as bootmethod.")
ecd <- eigen(cod, symmetric=TRUE)
ecd$values[ecd$values<0] <- 0
}
if (showplots & datatomatrix){
if (distances)
plot(dpoints,pch=sapply(c1$partition,toString),col=c1$partition)
else
plot(data,pch=sapply(c1$partition,toString),col=c1$partition)
}
for (l in 1:lb){
for (i in 1:B){
if (bootmethod[l]=="boot"){
bsamp <- sample(n,n,replace=TRUE)
if (!multipleboot) bsamp <- unique(bsamp)
if (distances)
mdata <- data[bsamp,bsamp]
else
mdata <- data[bsamp,]
}
if (bootmethod[l]=="subset"){
bsamp <- sample(n,subtuning,replace=FALSE)
if (distances)
mdata <- data[bsamp,bsamp]
else
mdata <- data[bsamp,]
}
if (bootmethod[l]=="jitter"){
jnoise <- matrix(0,ncol=p,nrow=n)
for (j in 1:p)
jnoise[,j] <- rnorm(n,sd=jsd[j])
jnoise <- jnoise %*% t(ecd$vectors)
mdata <- data+jnoise
bsamp <- 1:n
}
if (bootmethod[l]=="bojit"){
bsamp <- sample(n,n,replace=TRUE)
jnoise <- matrix(0,ncol=p,nrow=n)
for (j in 1:p)
jnoise[,j] <- rnorm(n,sd=jsd[j])
jnoise <- jnoise %*% t(ecd$vectors)
mdata <- data[bsamp,]+jnoise
}
if (bootmethod[l]=="noise"){
noiseind <- as.logical(rbinom(n,1,noisetuning[1]))
nn <- sum(noiseind)
jnoise <- matrix(0,ncol=p,nrow=nn)
for (j in 1:p)
jnoise[,j] <- runif(nn,min=-noisetuning[2]*sqrt(ecd$values[j]),
max=noisetuning[2]*sqrt(ecd$values[j]))
jnoise <- t(t(jnoise %*% t(ecd$vectors))+md)
mdata <- data
mdata[noiseind,] <- jnoise
bsamp <- (1:n)[!noiseind]
}
# print(mdata)
if ("diss" %in% names(formals(clustermethod)) & distances) {
# checkIfCanCluster(data=mdata, ...)
bc1 <- clustermethod(mdata,diss=TRUE,...)
} else {
# checkIfCanCluster(data=mdata, ...)
bc1 <- clustermethod(mdata,...)
}
# print("clustermethod done")
if (showplots & datatomatrix){
if (distances)
plot(dpoints[bsamp,],pch=sapply(bc1$partition,toString),
col=bc1$partition)
else
plot(mdata,pch=sapply(bc1$partition,toString),col=bc1$partition)
}
if (noisemethod){
effnc1 <- c1$nccl
effnb1 <- bc1$nccl
}
else{
effnc1 <- c1$nc
effnb1 <- bc1$nc
}
for (j in 1:effnc1){
maxgamma <- 0
if (effnb1>0){
for (k in 1:effnb1){
if (multipleboot){
if (bscompare) ncases <- 1:n
else{
ncases <- 1
# print(bsamp)
# print(str(bsamp))
m <- 2 # for (m in 2:n)
if (m<=n){
if (!(bsamp[m] %in% bsamp[1:(m-1)]))
ncases <- c(ncases,m)
m <- m+1
}
}
}
else ncases <- 1:length(bsamp)
# print(ncases)
# if (j==c1$nc){
# print(c1$clusterlist[[j]][bsamp][ncases])
# print(bc1$clusterlist[[k]][ncases])
# }
# print(bc1$nc)
# print(bc1$nccl)
# print(bc1$partition)
# print(str(bc1$clusterlist))
cg <- switch(bootmethod[l],
boot=clujaccard(c1$clusterlist[[j]][bsamp][ncases],
bc1$clusterlist[[k]][ncases],zerobyzero=0),
bojit=clujaccard(c1$clusterlist[[j]][bsamp][ncases],
bc1$clusterlist[[k]][ncases],zerobyzero=0),
subset=clujaccard(c1$clusterlist[[j]][bsamp],
bc1$clusterlist[[k]],zerobyzero=0),
jitter=clujaccard(c1$clusterlist[[j]],bc1$clusterlist[[k]],zerobyzero=0),
noise=clujaccard(c1$clusterlist[[j]][!noiseind],
bc1$clusterlist[[k]][!noiseind],zerobyzero=0))
if (cg>maxgamma) maxgamma <- cg
}
}
if (bootmethod[l]=="boot") bootresult[j,i] <- maxgamma
if (bootmethod[l]=="subset") subsetresult[j,i] <- maxgamma
if (bootmethod[l]=="bojit") bojitresult[j,i] <- maxgamma
if (bootmethod[l]=="jitter") jitterresult[j,i] <- maxgamma
if (bootmethod[l]=="noise") noiseresult[j,i] <- maxgamma
}
if (noisemethod){
if (c1$nc>c1$nccl){
j <- c1$nc
if (bc1$nc>bc1$nccl)
maxgamma <- switch(bootmethod[l],
boot=clujaccard(c1$clusterlist[[c1$nc]][bsamp][ncases],
bc1$clusterlist[[bc1$nc]][ncases],zerobyzero=0),
bojit=clujaccard(c1$clusterlist[[c1$nc]][bsamp][ncases],
bc1$clusterlist[[bc1$nc]][ncases],zerobyzero=0),
subset=clujaccard(c1$clusterlist[[c1$nc]][bsamp],
bc1$clusterlist[[bc1$nc]],zerobyzero=0),
jitter=clujaccard(c1$clusterlist[[c1$nc]],bc1$clusterlist[[bc1$nc]],zerobyzero=0),
noise=clujaccard(c1$clusterlist[[c1$nc]][!noiseind],
bc1$clusterlist[[bc1$nc]][!noiseind],zerobyzero=0))
else
maxgamma <- 0
if (bootmethod[l]=="boot") bootresult[j,i] <- maxgamma
if (bootmethod[l]=="subset") subsetresult[j,i] <- maxgamma
if (bootmethod[l]=="bojit") bojitresult[j,i] <- maxgamma
if (bootmethod[l]=="jitter") jitterresult[j,i] <- maxgamma
if (bootmethod[l]=="noise") noiseresult[j,i] <- maxgamma
}
}
}
}
if (!("boot" %in% bootmethod))
bootresult <- bootmean <- bootbrd <- bootrecover <- NULL
else{
bootmean=apply(bootresult,1,mean, na.rm=TRUE)
bootbrd=apply(bootresult,1,sumlogic,y=dissolution,relation="se")
bootrecover=apply(bootresult,1,sumlogic,y=recover,relation="l")
}
if (!("jitter" %in% bootmethod))
jitterresult <- jittermean <- jitterbrd <- jitterrecover <- NULL
else{
jittermean=apply(jitterresult,1,mean, na.rm=TRUE)
jitterbrd=apply(jitterresult,1,sumlogic,y=dissolution,relation="se")
jitterrecover=apply(jitterresult,1,sumlogic,y=recover,relation="l")
}
if (!("subset" %in% bootmethod)) subsetresult <- subsetmean <-
subsetbrd <- subsetrecover <- NULL
else{
subsetmean=apply(subsetresult,1,mean, na.rm=TRUE)
subsetbrd=apply(subsetresult,1,sumlogic,y=dissolution,relation="se")
subsetrecover=apply(subsetresult,1,sumlogic,y=recover,relation="l")
}
if (!("noise" %in% bootmethod)) noiseresult <- noisemean <-
noisebrd <- noiserecover <- NULL
else{
noisemean=apply(noiseresult,1,mean, na.rm=TRUE)
noisebrd=apply(noiseresult,1,sumlogic,y=dissolution,relation="se")
noiserecover=apply(noiseresult,1,sumlogic,y=recover,relation="l")
}
if (!("bojit" %in% bootmethod)) bojitresult <- bojitmean <-
bojitbrd <- bojitrecover <- NULL
else{
bojitmean=apply(bojitresult,1,mean, na.rm=TRUE)
bojitbrd=apply(bojitresult,1,sumlogic,y=dissolution,relation="se")
bojitrecover=apply(bojitresult,1,sumlogic,y=recover,
relation="l")
}
if (showplots & datatomatrix){
if (distances)
plot(dpoints,pch=sapply(c1$partition,toString),col=c1$partition)
else
plot(data,pch=sapply(c1$partition,toString),col=c1$partition)
}
out <- list(result=c1,partition=c1$partition,
nc=c1$nc, nccl=c1$nccl,
clustermethod=c1$clustermethod,
B=B,
noisemethod=noisemethod,
bootmethod=bootmethod,
multipleboot=multipleboot,
dissolution=dissolution,
recover=recover,
bootresult=bootresult,
bootmean=bootmean,
bootbrd=bootbrd,
bootrecover=bootrecover,
jitterresult=jitterresult,
jittermean=jittermean,
jitterbrd=jitterbrd,
jitterrecover=jitterrecover,
subsetresult=subsetresult,
subsetmean=subsetmean,
subsetbrd=subsetbrd,
subsetrecover=subsetrecover,
bojitresult=bojitresult,
bojitmean=bojitmean,
bojitbrd=bojitbrd,
bojitrecover=bojitrecover,
noiseresult=noiseresult,
noisemean=noisemean,
noisebrd=noisebrd,
noiserecover=noiserecover)
class(out) <- "clboot"
out
}
print.clboot <- function(x, statistics=c("mean","dissolution","recovery"),...){
cat("* Cluster stability assessment *\n")
cat("Cluster method: ",x$clustermethod,"\n")
cat("Full clustering results are given as parameter result\n")
cat("of the clusterboot object, which also provides further statistics\n")
cat("of the resampling results.\n")
cat("Number of resampling runs: ",x$B,"\n\n")
cat("Number of clusters found in data: ",x$nc,"\n\n")
if (x$noisemethod & x$nc>x$nccl)
cat("The last cluster corresponds to noise/outliers\n\n")
for (i in 1:length(x$bootmethod)){
if (x$bootmethod[i]=="boot"){
cat(" Clusterwise Jaccard bootstrap ")
if (!x$multipleboot) cat("(omitting multiple points) ")
if ("mean" %in% statistics){
cat("mean:\n")
print(x$bootmean)
}
if ("dissolution" %in% statistics){
cat("dissolved:\n")
print(x$bootbrd)
}
if ("recovery" %in% statistics){
cat("recovered:\n")
print(x$bootrecover)
}
}
if (x$bootmethod[i]=="subset"){
cat(" Clusterwise Jaccard subsetting ")
if ("mean" %in% statistics){
cat("mean:\n")
print(x$subsetmean)
}
if ("dissolution" %in% statistics){
cat("dissolved:\n")
print(x$subsetbrd)
}
if ("recovery" %in% statistics){
cat("recovered:\n")
print(x$subsetrecover)
}
}
if (x$bootmethod[i]=="jitter"){
cat(" Clusterwise Jaccard jittering ")
if ("mean" %in% statistics){
cat("mean:\n")
print(x$jittermean)
}
if ("dissolution" %in% statistics){
cat("dissolved:\n")
print(x$jitterbrd)
}
if ("recovery" %in% statistics){
cat("recovered:\n")
print(x$jitterrecover)
}
}
if (x$bootmethod[i]=="noise"){
cat(" Clusterwise Jaccard replacement by noise ")
if ("mean" %in% statistics){
cat("mean:\n")
print(x$noisemean)
}
if ("dissolution" %in% statistics){
cat("dissolved:\n")
print(x$noisebrd)
}
if ("recovery" %in% statistics){
cat("recovered:\n")
print(x$noiserecover)
}
}
if (x$bootmethod[i]=="bojit"){
cat(" Clusterwise Jaccard bootstrap/jittering ")
if ("mean" %in% statistics){
cat("mean:\n")
print(x$bojitmean)
}
if ("dissolution" %in% statistics){
cat("dissolved:\n")
print(x$bojitbrd)
}
if ("recovery" %in% statistics){
cat("recovered:\n")
print(x$bojitrecover)
}
}
}
invisible(x)
}
# @export
plot.clboot <- function(x,xlim=c(0,1),breaks=seq(0,1,by=0.05),...){
par(mfrow=c(x$nc,length(x$bootmethod)))
for (j in 1:x$nc)
for (i in 1:length(x$bootmethod)){
if (x$bootmethod[i]=="boot")
hist(x$bootresult[j,],xlim=xlim,breaks=breaks,
xlab="Jaccard similarity",
main=paste("Bootstrap, cluster ",j))
if (x$bootmethod[i]=="subset")
hist(x$subsetresult[j,],xlim=xlim,breaks=breaks,
xlab="Jaccard similarity",
main=paste("Subsetting, cluster ",j))
if (x$bootmethod[i]=="jitter")
hist(x$jitterresult[j,],xlim=xlim,breaks=breaks,
xlab="Jaccard similarity",
main=paste("Jittering, cluster ",j))
if (x$bootmethod[i]=="noise")
hist(x$noiseresult[j,],xlim=xlim,breaks=breaks,
xlab="Jaccard similarity",
main=paste("Replacement by noise, cluster ",j))
if (x$bootmethod[i]=="bojit")
hist(x$bojitresult[j,],xlim=xlim,breaks=breaks,
xlab="Jaccard similarity",
main=paste("Bootstrap/jittering, cluster ",j))
}
par(mfrow=c(1,1))
invisible()
}
# disttrimkmeansCBI <- function(dmatrix,k,scaling=TRUE,trim=0.1,
# mdsmethod="classical",
# mdsdim=4,...){
# dmatrix <- as.matrix(dmatrix)
# n <- ncol(dmatrix)
# # require(MASS)
# if (mdsmethod != "classical") {
# mindm <- min(dmatrix[dmatrix > 0])/10
# dmatrix[dmatrix<mindm] <- mindm
# }
# data <- switch(mdsmethod, classical = cmdscale(dmatrix, k = mdsdim),
# kruskal = isoMDS(dmatrix, k = mdsdim)$points, sammon =
# sammon(dmatrix, k = mdsdim)$points)
# c1 <- trimkmeans(data,k=k,scaling=scaling,trim=trim,...)
# partition <- c1$classification
# cl <- list()
# nccl <- k
# nc <- k+1
# # print(nc)
# # print(sc1)
# for (i in 1:nc)
# cl[[i]] <- partition==i
# out <- list(result=c1,nc=nc,nccl=nccl,clusterlist=cl,partition=partition,
# clustermethod="trimkmeans plus MDS")
# out
# }
disthclustCBI <- function(dmatrix,k,cut="number",method,noisecut=0,...){
n <- nrow(as.matrix(dmatrix))
c1 <- hclust(as.dist(dmatrix),method=method)
noise <- FALSE
if (cut=="number")
partition <- cutree(c1,k=k)
else
partition <- cutree(c1,h=k)
nc <- max(partition)
clsizes <- numeric(0)
for (i in 1:nc) clsizes[i] <- sum(partition==i)
ncn <- sum(clsizes>noisecut)
if (ncn<nc){
noise <- TRUE
newcln <- (1:nc)[clsizes>noisecut]
nc <- ncn+1
newpart <- rep(nc,n)
for (i in 1:ncn)
newpart[partition==newcln[i]] <- i
partition <- newpart
}
cl <- list()
# print(nc)
# print(sc1)
for (i in 1:nc)
cl[[i]] <- partition==i
out <- list(result=c1,noise=noise,nc=nc,nccl=ncn,
clusterlist=cl,partition=partition,
clustermethod="hclust")
out
}
disthclusttreeCBI <- function(dmatrix,minlevel=2,method,...){
n <- nrow(as.matrix(dmatrix))
c1 <- hclust(as.dist(dmatrix),method=method)
clist <- list()
for (i in 1:n){
clist[[i]] <- rep(FALSE,n)
clist[[i]][i] <- TRUE
}
clcount <- n
for (j in 1:(n-2)){
clcount <- clcount+1
if (c1$merge[j,1]<0) clist1 <- clist[[-c1$merge[j,1]]]
else clist1 <- clist[[n+c1$merge[j,1]]]
if (c1$merge[j,2]<0) clist2 <- clist[[-c1$merge[j,2]]]
else clist2 <- clist[[n+c1$merge[j,2]]]
clist[[clcount]] <- clist2 | clist1
}
clusterlist <- list()
if (minlevel==1){
clusterlist <- clist
nc <- clcount
}
else{
for (j in (n+minlevel-1):clcount)
clusterlist[[j-minlevel-n+2]] <- clist[[j]]
nc <- clcount-minlevel-n+2
}
# print(nc)
# print(sc1)
out <- list(result=c1,nc=nc,clusterlist=clusterlist,partition=cutree(c1,2),
clustermethod="hclust, full tree")
out
}
classifnp <- function (data, clustering, method = "centroid", cdist = NULL,
centroids = NULL, nnk = 1)
{
data <- as.matrix(data)
k <- max(clustering)
p <- ncol(data)
n <- nrow(data)
topredict <- clustering < 0
if (method == "averagedist") {
if (is.null(cdist))
cdist <- as.matrix(dist(data))
else cdist <- as.matrix(cdist)
prmatrix <- matrix(0, ncol = k, nrow = sum(topredict))
for (j in 1:k) prmatrix[, j] <- rowMeans(as.matrix(cdist[topredict,
clustering == j]))
clpred <- apply(prmatrix, 1, which.min)
clustering[topredict] <- clpred
}
if (method == "centroid") {
if (is.null(centroids)) {
centroids <- matrix(NA, ncol = p, nrow = k)
grvec <- numeric(0)
for (j in 1:k)
if (sum(clustering==j)>0){
centroids[j, ] <- colMeans(as.matrix(data[clustering == j, ]))
grvec <- c(grvec,j)
}
}
else
grvec <- 1:k
# print(centroids)
clustering[topredict] <- knn1(centroids[grvec,], data[topredict,
], grvec)
}
if (method == "qda") {
qq <- try(qda(data[!topredict, ], grouping = as.factor(clustering[!topredict])),
silent = TRUE)
if (identical(attr(qq, "class"), "try-error"))
qq <- lda(data[!topredict, ], grouping = as.factor(clustering[!topredict]))
clustering[topredict] <- as.integer(predict(qq, data[topredict,
])$class)
}
if (method == "knn")
clustering[topredict] <- as.integer(knn(data[!topredict,
], data[topredict, ], as.factor(clustering[!topredict]),
k = nnk))
clustering
}
# method centroids doesn't allow centroids=NULL
classifdist <- function(cdist,clustering,
method="averagedist",
centroids=NULL,nnk=1){
k <- max(clustering)
n <- nrow(data)
cdist <- as.matrix(cdist)
topredict <- clustering<0
if(method=="averagedist"){
prmatrix <- matrix(0,ncol=k,nrow=sum(topredict))
for (j in 1:k)
prmatrix[,j] <- rowMeans(as.matrix(cdist[topredict,clustering==j]))
clpred <- apply(prmatrix,1,which.min)
clustering[topredict] <- clpred
}
# print("centroids&cdist")
# print(centroids)
# print(cdist[topredict,centroids,drop=FALSE])
if(method=="centroid")
clustering[topredict] <- apply(cdist[topredict,centroids,drop=FALSE],1,which.min)
if(method=="knn"){
cdist[topredict,topredict] <- max(cdist)+1
if (nnk==1){
bestobs <- apply(cdist[topredict,,drop=FALSE],1,which.min)
clustering[topredict] <- clustering[bestobs]
}
else{
for(i in (1:n)[topredict]){
bestobs <- order(cdist[i,])[1:nnk]
clasum <- numeric(0)
for (j in 1:k)
clasum[j] <- sum(clustering[bestobs]==j)
clustering[i] <- which.max(clasum)
}
}
}
clustering
}
nselectboot <- function (data, B = 50, distances = inherits(data, "dist"), clustermethod = NULL,
classification = "averagedist", krange = 2:10, count = FALSE,
nnk = 1, ...)
{
dista <- distances
data <- as.matrix(data)
if (classification == "average") {
if (dista)
dmat <- data
else dmat <- as.matrix(dist(data))
}
stab <- matrix(0, nrow = B, ncol = max(krange))
n <- nrow(data)
for (k in krange) {
if (count)
cat(k, " clusters\n")
for (i in 1:B) {
if (count)
print(i)
d1 <- sample(n, n, replace = TRUE)
d2 <- sample(n, n, replace = TRUE)
if (dista) {
dmat1 <- data[d1, d1]
dmat2 <- data[d2, d2]
}
else {
dmat1 <- data[d1, ]
dmat2 <- data[d2, ]
}
# print("start clustermethod")
# if ("diss" %in% names(formals(clustermethod)) & distances){
if (distances){
clm1 <- clustermethod(as.dist(dmat1), k = k, diss=TRUE, ...)
clm2 <- clustermethod(as.dist(dmat2), k = k, diss=TRUE, ...)
}
else{
clm1 <- clustermethod(dmat1, k = k, ...)
# cl1 <- clm1$partition
# print("done 1")
clm2 <- clustermethod(dmat2, k = k, ...)
}
cl2 <- clm2$partition
# print("done 2")
cj1 <- cj2 <- rep(-1, n)
# print(d1)
# print(clm1$partition)
# print(d2)
# print(clm2$partition)
cj1[d1] <- clm1$partition
cj2[d2] <- clm2$partition
# browser()
if (dista) {
cj1 <- classifdist(data, cj1, method = classification,
centroids = clm1$result$medoids, nnk = nnk)
cj2 <- classifdist(data, cj2, method = classification,
centroids = clm2$result$medoids, nnk = nnk)
}
else {
centroids <- NULL
if (classification == "centroid") {
if (identical(clustermethod, kmeansCBI)){
centroids1 <- clm1$result$centers
centroids2 <- clm2$result$centers
}
if (identical(clustermethod, claraCBI)){
centroids1 <- clm1$result$medoids
centroids2 <- clm2$result$medoids
}
}
# print("classifnp")
cj1 <- classifnp(data, cj1, method = classification,
centroids = centroids1, nnk = nnk)
# print("done 1")
cj2 <- classifnp(data, cj2, method = classification,
centroids = centroids2, nnk = nnk)
# plot(data,col=cj1,pch=cj1,main=paste(k,"clusters a -",i))
# plot(data,col=cj2,pch=cj2,main=paste(k,"clusters b -",i))
# print("done 2")
# browser()
}
# print(" for n loop")
# j <- 1
ctable <- table(cj1,cj2)
nck1 <- rowSums(ctable)
stab[i,k] <- sum(nck1^2-rowSums(ctable^2))
# browser()
# for(j in 1:k)
# for(j in 1:n){
# cj1e <- cj1==cj1[j]
# cj1u <- !cj1e
# cj2e <- cj2==cj2[j]
# cj2u <- !cj2e
# stab[i,k] <- stab[i,k]+sum(cj1e & cj2u)+sum(cj2e & cj1u)
# j <- j+1 }
# print("done")
} # for i
} # for k
stab <- stab/n^2
stabk <- rep(NA, max(krange))
# browser()
for (k in krange) stabk[k] <- mean(stab[, k])
kopt <- which.min(stabk)
out <- list(kopt = kopt, stabk = stabk, stab = stab)
}
# nselectboot <- function (data, B = 50, distances = inherits(data, "dist"), clustermethod = NULL,
# classification = "averagedist", krange = 2:10, count = FALSE,
# nnk = 1, ...)
# {
# dista <- distances
# data <- as.matrix(data)
# if (classification == "average") {
# if (dista)
# dmat <- data
# else dmat <- as.matrix(dist(data))
# }
# stab <- matrix(0, nrow = B, ncol = max(krange))
# n <- nrow(data)
# for (k in krange) {
# if (count)
# cat(k, " clusters\n")
# for (i in 1:B) {
# if (count)
# print(i)
# d1 <- sample(n, n, replace = TRUE)
# d2 <- sample(n, n, replace = TRUE)
# if (dista) {
# dmat1 <- data[d1, d1]
# dmat2 <- data[d2, d2]
# }
# else {
# dmat1 <- data[d1, ]
# dmat2 <- data[d2, ]
# }
# clm1 <- clustermethod(dmat1, k = k, ...)
# clm2 <- clustermethod(dmat2, k = k, ...)
# cj1 <- cj2 <- rep(-1, n)
# cj1[d1] <- clm1$partition
# cj2[d2] <- clm2$partition
# if (dista) {
# cj1 <- classifdist(data, cj1, method = classification,
# centroids = clm1$result$medoids, nnk = nnk)
# cj2 <- classifdist(data, cj2, method = classification,
# centroids = clm2$result$medoids, nnk = nnk)
# }
# else {
# centroids <- NULL
# if (classification == "centroid") {
# if (identical(clustermethod, kmeansCBI)){
# centroids1 <- clm1$result$centers
# centroids2 <- clm2$result$centers
# }
# if (identical(clustermethod, claraCBI)){
# centroids1 <- clm1$result$medoids
# centroids2 <- clm2$result$medoids
# }
# }
# cj1 <- classifnp(data, cj1, method = classification,
# centroids = centroids1, nnk = nnk)
# cj2 <- classifnp(data, cj2, method = classification,
# centroids = centroids2, nnk = nnk)
# }
# j <- 1
# if (j<=n){
# cj1e <- cj1==cj1[j]
# cj1u <- !cj1e
# cj2e <- cj2==cj2[j]
# cj2u <- !cj2e
# stab[i,k] <- stab[i,k]+sum(cj1e & cj2u)+sum(cj2e & cj1u)
# j <- j+1
# }
# }
# }
# stab <- stab/n^2
# stabk <- rep(NA, max(krange))
# for (k in krange) stabk[k] <- mean(stab[, k])
# kopt <- which.min(stabk)
# out <- list(kopt = kopt, stabk = stabk, stab = stab)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.