Nothing
#' Allocation of observations to pre-established cluster centers.
#'
#'
#' Here, observations of a dataset are allocated to a set of preestablished
#' cluster centers. This is intended to be used for the test set in train-test
#' dataset situations.
#' @importFrom moments kurtosis
#' @param inDataFrame A dataframe or matrix with the data that that the cluster
#' centers will be allocated to. This data should be scaled in the same way as
#' the data for the original depeche was scaled when it entered the algorithm,
#' i.e. in the normal case, not at all.
#' @param clusterCenters A matrix that needs to be inherited from a depeche run.
#' It contains the information about which clusters and variables that have been
#' sparsed away and where the cluster centers are located for the remaining
#' clusters and variables.
#' @param log2Off If the automatic detection for high kurtosis, and followingly,
#' the log2 transformation, should be turned off.
#' @param noZeroNum For internal use. Controls the that the internal
#' algorithm returns a cluster with number 0.
#' @seealso \code{\link{depeche}}
#' @return A vector with the same length as number of rows in the inDataFrame,
#' where the cluster identity of each observation is noted.
#'
#' @examples
#' # Retrieve some example data
#' data(testData)
#' \dontrun{
#' # Now arbitrarily (for the sake of the example) divide the data into a
#' # training- and a test set.
#' testDataSample <- sample(1:nrow(testData), size = 10000)
#' testDataTrain <- testData[testDataSample, ]
#' testDataTest <- testData[-testDataSample, ]
#'
#' # Run the depeche function for the train set
#'
#' x_depeche_train <- depeche(testDataTrain[, 2:15],
#' maxIter = 20,
#' sampleSize = 1000
#' )
#'
#' # Allocate the test dataset to the centers of the train dataset
#' x_depeche_test <- dAllocate(testDataTest[, 2:15],
#' clusterCenters = x_depeche_train$clusterCenters
#' )
#'
#' # And finally plot the two groups to see how great the overlap was:
#' trainTablePerId <- apply(as.matrix(table(
#' testDataTrain$ids,
#' x_depeche_train$clusterVector
#' )), 1, function(x) x / sum(x))
#' trainTableCollapsed <- apply(trainTablePerId, 1, sum)
#' trainTableFraction <- trainTableCollapsed / sum(trainTableCollapsed)
#' testTablePerId <- apply(
#' as.matrix(table(testDataTest$ids, x_depeche_test)),
#' 1, function(x) x / sum(x)
#' )
#' testTableCollapsed <- apply(testTablePerId, 1, sum)
#' testTableFraction <- testTableCollapsed / sum(testTableCollapsed)
#' xmatrix <- t(cbind(trainTableFraction, testTableFraction))
#' library(gplots)
#' barplot2(xmatrix, beside = TRUE, legend = rownames(xmatrix))
#' title(main = "Difference between train and test set")
#' title(xlab = "Clusters")
#' title(ylab = "Fraction")
#' }
#' @export dAllocate
dAllocate <- function(inDataFrame, clusterCenters, log2Off = FALSE,
noZeroNum = TRUE) {
if (is.data.frame(inDataFrame)) {
inDataFrame <- as.matrix(inDataFrame)
}
if (log2Off == FALSE && kurtosis(as.vector(inDataFrame)) > 100) {
kurtosisValue1 <- kurtosis(as.vector(inDataFrame))
# Here, the log transformation is
# performed. In cases where the lowest
# value is 0, everything is simple. In
# other cases, a slightly more
# complicated formula is needed
if (min(inDataFrame) >= 0) {
inDataFrame <- log2(inDataFrame + 1)
} else {
# First, the data needs to be reasonably
# log transformed to not too extreme
# values, but still without loosing
# resolution.
inDataFrameLog <- log2(apply(
inDataFrame, 2,
function(x) x - min(x)
) + 1)
# Then, the extreme negative values will
# be replaced by 0, as they give rise to
# artefacts.
inDataFrameLog[which(is.nan(inDataFrameLog))] <- 0
}
kurtosisValue2 <- kurtosis(as.vector(inDataFrame))
message(
"The data was found to be heavily tailed (kurtosis ",
kurtosisValue1, "). Therefore, it was log2-transformed, ",
"leading to a new kurtosis value of ", kurtosisValue2, "."
)
}
# Here, all variables that do not
# contribute to defining a single cluster
# is removed.
clusterCentersReduced <-
clusterCenters[
which(rowSums(clusterCenters) != 0),
which(colSums(clusterCenters) != 0)
]
# If some variables have been excluded as
# they did not contribute to construction
# of any cluster, they are removed from
# the inData here. The special case with only one variable is taken
# into account.
# There are two different methods here: one for external and one for
# internal use. In the first case, there are no colnumn names, but the
# properties of the cluster centers are also more raw and thus informative.
if (length(colnames(clusterCenters)) > 0) {
inDataFrameReduced <- inDataFrame[, colnames(clusterCenters)]
} else {
inDataFrameReduced <-
inDataFrame[, which(colSums(clusterCenters) != 0)]
}
# Here, a specific case, namely that only one variable contains
# meaningful information, is taken into account.
if (is.vector(inDataFrameReduced)) {
clusterCentersReduced <- as.matrix(clusterCentersReduced)
inDataFrameReduced <- as.matrix(inDataFrameReduced)
}
clusterReallocationResult <- allocate_points(
inDataFrameReduced,
clusterCentersReduced, 1
)[[1]]
# As allocate_points spontaneously likes to throw out a cluster called 0,
# this behaviour is controlled here
if (noZeroNum) {
clusterReallocationResult <- clusterReallocationResult + 1
}
return(clusterReallocationResult)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.