######################################################
#function
# jaccard(x, y)
# for two binary variables (values 0,1)
jaccard = function (x, y) {
c11 = sum(x == 1 & y == 1)
c10 = sum(x == 1 & y == 0)
c01 = sum(x == 0 & y == 1)
return (c11 / (c11 + c10 + c01))
}
######################################################
#function
# jaccard.cluster(clust1, clust2)
# for two clusterings from a data set
jaccard.cluster <- function(clust1,clust2) {
cat <- sort(unique(c(clust1,clust2)))
aux <- rep(0,length(cat))
if (length(clust1)==length(clust2) & all(sort(unique(clust1))==sort(unique(clust2)))) {
for (i.aux in cat) {
clust.x <- clust1==i.aux
for (j.aux in cat) {
clust.y <- clust2==j.aux
aux[i.aux] <- max(aux[i.aux],jaccard(clust.x,clust.y))
}
}
}
return(aux)
}
######################################################
#function
# boot.cluster(data, nk=5, B=100, seed=NULL)
# stability cluster based on Jaccard by clustering bootstrap
boot.cluster <- function(data, nk=5, B=10, seed=NULL, prnt=FALSE) {
old.seed <- .Random.seed
on.exit( { .Random.seed <- old.seed } )
if (!is.null(seed)) {
#http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
set.seed(as.integer(seed)) #seed
} else {
# Default seed
set.seed(pkg.env$seed)
}
data <- as.matrix(data)
n.data <- nrow(data)
cluster1 <- kmeans(x=data, centers=nk, iter.max=100)
cluster1$partition <- cluster1$cluster
cluster1$nk <- nk
bs.jaccard <- matrix(0,nrow=nk, ncol=B)
for (i in 1:B){ # for B botstrap replications
if (prnt) cat("bootstrap replicate ", i,"\n") # Show the bootstrap replication index
bscheck <- TRUE
while (bscheck) {
bs.samp <- sample(n.data,n.data,replace=TRUE)
bs.samp <- unique(bs.samp)
bs.data <- data[bs.samp,]
if (nk <= length(levels(as.factor(bs.data))))
bscheck <- FALSE
}
if (length(bs.data) <= nk) { # Not enough data for kmeans
message("\tWarning: Could not process data for k = ", nk)
return (NULL)
}
bs.cluster <- kmeans(bs.data, centers=nk, iter.max=100)
bs.cluster$partition <- bs.cluster$cluster
bs.cluster$nk <- nk
bs.jaccard[,i] <- jaccard.cluster(cluster1$partition[bs.samp], bs.cluster$partition)
} # end for i in 1:B
bs.jaccard.mean <- apply(bs.jaccard, 1, mean, na.rm=TRUE) # 1=by rows
return(list(result=cluster1, partition=cluster1$partition, nk=cluster1$nk,
B=B, bsjaccard=bs.jaccard, means=bs.jaccard.mean))
}
######################################################
######################################################
#function
# fAnova(clusterResult, k, num.elements)
# Custom anova computation: F function.
fAnova <- function(clusterResult, k, n) {
#cat("k: ", k, "\n")
#cat("clusterResult$betweenss: ", clusterResult$betweenss, "\n")
#cat("clusterResult$tot.withinss: ", clusterResult$tot.withinss, "\n")
#cat("(k-1): ", (k-1), "\n")
#cat("(n-k): ", (n-k), "\n")
#cat("d1: ", clusterResult$betweenss/(k-1), "\n")
#cat("d2: ", (clusterResult$tot.withinss)/(n-k), "\n")
#cat("r: ", (clusterResult$betweenss/(k-1))/(clusterResult$tot.withinss)/(n-k), "\n")
#("r: ", (clusterResult$betweenss/(k-1))/((clusterResult$tot.withinss)/(n-k)), "\n")
return ((clusterResult$betweenss/(k-1))/((clusterResult$tot.withinss)/(n-k)))
}
######################################################
######################################################
#function
# getMeasureValue(km5, measureName)
# Returns a measure as in km5$measureName
getMeasureValue <- function(km5, measureName) {
if (startsWith(measureName, "cluster_")) {
return (toString(unlist(km5$csv[measureName], use.names = FALSE)))
} else {
stop("Unknown measure '", measureName, "'. Stopping.")
}
}
######################################################
######################################################
#function
# clusterbootWrapper(data, B, bootmethod="boot",
# clustermethod=kmeansCBI, krange, seed, ...)
# Wrapper method for clusterboot functionality.
clusterbootWrapper <- function(data, B, bootmethod="boot",
cbi, krange, gold_standard, seed, ...) {
cbiHelperResult = helperGetCBI(cbi, krange, ...)
#cat("Using: ", cbi, "\n")
#cat("Type: ", typeof(cbiHelperResult[["method"]]), "\n")
#print(cbiHelperResult)
mandatoryArgs = list(
"data"=data,
"B"=B,
"bootmethod"=bootmethod,
"gold_standard"=gold_standard,
"seed"=seed,
"clustermethod"=cbiHelperResult[["method"]]
)
methodArgs = append(mandatoryArgs, cbiHelperResult[["args"]])
# Append parameters that the user might have specified in the ellipsis
methodArgs = append(methodArgs, list(...))
return (
#quiet(
do.call(
clusterboot,
methodArgs
# )
)
)
}
######################################################
######################################################
#function
# clusteringWrapper(data, cbi, krange, seed)
# Wrapper method for clustering without bootstrap functionality.
clusteringWrapper <- function(data, cbi, krange, seed, ...) {
cbiHelperResult = helperGetCBI(cbi, krange, ...)
if(exists(".Random.seed")){
# .Random.seed might not exist when launched as background job
# so only store and reset if it exists
old.seed <- .Random.seed
}
on.exit(
{
if(exists("old.seed")) {
.Random.seed <<- old.seed
}
}
)
if (!is.null(seed)) set.seed(seed)
#cat ("Using: ", cbi, "\n")
mandatoryArgs = list(
"data"=data
)
methodArgs = append(mandatoryArgs, cbiHelperResult[["args"]])
# Append parameters that the user might have specified in the ellipsis
methodArgs = append(methodArgs, list(...))
# print(cbiHelperResult[["method"]])
# print(methodArgs)
return (
quiet(
do.call(
cbiHelperResult[["method"]],
methodArgs
)
)
)
}
######################################################
######################################################
#function
# checkIfCanCluster(data, k)
# It prevents clusterboot method from getting stuck by checking if
# there is enough data to perform a clustering.
checkIfCanCluster <- function(data, ...) {
mc <- as.list(match.call(expand.dots = TRUE))
k = NULL
if ("krange" %in% names(mc)) {
k = mc[["krange"]]
} else if ("k" %in% names(mc) ) {
k = mc[["k"]]
} else {
stop(paste0("Unexpected error. Could not retrieve 'k' value from dot-dot-dot argument. ",
"Arguments were: ", names(mc)))
}
numUnique = length(unique(data))
#print(paste0("Not unique:", length(data)))
#print(paste0("Unique:", numUnique))
#print(paste0("Division is:", numUnique/k))
#print(paste0("Do I stop?: ", (numUnique/k) < 1))
if ((numUnique/k) < 1) {
stop(paste0("Not enough data to cluster '", numUnique, "' unique values in '", k, "' clusters"))
}
}
######################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.