.initializeCluster <- function(N,
len,
z = NULL,
initial = NULL,
fixed = NULL) {
# If initial values are given, then they will not be randomly initialized
if (!is.null(initial)) {
initValues <- sort(unique(initial))
if (length(unique(initial)) != N || length(initial) != len ||
!all(initValues %in% seq(N))) {
stop(
"'initial' needs to be a vector of length 'len'",
" containing N unique values."
)
}
z <- as.integer(as.factor(initial))
} else {
z <- rep(NA, len)
}
# Set any values that need to be fixed during sampling
if (!is.null(fixed)) {
fixedValues <- sort(unique(fixed))
if (length(fixed) != len || !all(fixedValues %in% seq(N))) {
stop(
"'fixed' to be a vector of length 'len' where each entry is",
" one of N unique values or NA."
)
}
fixedIx <- !is.na(fixed)
z[fixedIx] <- fixed[fixedIx]
zNotUsed <- setdiff(seq(N), unique(fixed[fixedIx]))
} else {
zNotUsed <- seq(N)
fixedIx <- rep(FALSE, len)
}
# Randomly sample remaining values
zNa <- which(is.na(z))
if (length(zNa) > 0) {
z[zNa] <- sample(zNotUsed, length(zNa), replace = TRUE)
}
# Check to ensure each value is in the vector at least once
missing <- setdiff(seq(N), z)
for (i in missing) {
ta <- sort(table(z[!fixedIx]), decreasing = TRUE)
if (ta[1] == 1) {
stop("'len' is not long enough to accomodate 'N' unique values")
}
ix <- which(z == as.integer(names(ta))[1] & !fixedIx)
z[sample(ix, 1)] <- i
}
return(z)
}
.initializeSplitZ <- function(counts,
K,
KSubcluster = NULL,
alpha = 1,
beta = 1,
minCell = 3) {
s <- rep(1, ncol(counts))
if (is.null(KSubcluster)) {
KSubcluster <- ceiling(sqrt(K))
}
# Initialize the model with KSubcluster clusters
res <- .celda_C(
counts,
K = min(KSubcluster, ncol(counts)),
maxIter = 20,
zInitialize = "random",
alpha = alpha,
beta = beta,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE
)
overallZ <- as.integer(as.factor(celdaClusters(res)$z))
currentK <- max(overallZ)
counter <- 0
while (currentK < K & counter < 25) {
# Determine which clusters are split-able
# KRemaining <- K - currentK
KPerCluster <- min(ceiling(K / currentK), KSubcluster)
KToUse <- ifelse(KPerCluster < 2, 2, KPerCluster)
zTa <- tabulate(overallZ, max(overallZ))
zToSplit <- which(zTa > minCell & zTa > KToUse)
if (length(zToSplit) > 1) {
zToSplit <- sample(zToSplit)
} else if (length(zToSplit) == 0) {
break
}
# Cycle through each splitable cluster and split it up into
# K.sublcusters
for (i in zToSplit) {
clustLabel <- .celda_C(counts[, overallZ == i, drop = FALSE],
K = KToUse,
zInitialize = "random",
alpha = alpha,
beta = beta,
maxIter = 20,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE)
tempZ <- as.integer(as.factor(celdaClusters(clustLabel)$z))
# Reassign clusters with label > 1
splitIx <- tempZ > 1
ix <- overallZ == i
newZ <- overallZ[ix]
newZ[splitIx] <- currentK + tempZ[splitIx] - 1
overallZ[ix] <- newZ
currentK <- max(overallZ)
# Ensure that the maximum number of clusters does not get too large'
if (currentK > K + 10) {
break
}
}
counter <- counter + 1
}
# Decompose counts for likelihood calculation
p <- .cCDecomposeCounts(counts, s, overallZ, currentK)
nS <- p$nS
nG <- p$nG
nM <- p$nM
mCPByS <- p$mCPByS
nGByCP <- p$nGByCP
nCP <- p$nCP
nByC <- p$nByC
# Remove clusters 1-by-1 until K is reached
while (currentK > K) {
# Find second best assignment give current assignments for each cell
probs <- .cCCalcEMProbZ(counts,
s = s,
z = overallZ,
K = currentK,
mCPByS = mCPByS,
nGByCP = nGByCP,
nByC = nByC,
nCP = nCP,
nG = nG,
nM = nM,
alpha = alpha,
beta = beta,
doSample = FALSE)
zProb <- t(as.matrix(probs$probs))
zProb[cbind(seq(nrow(zProb)), overallZ)] <- NA
zSecond <- apply(zProb, 1, which.max)
zTa <- tabulate(overallZ, currentK)
zNonEmpty <- which(zTa > 0)
# Find worst cluster by logLik to remove
previousZ <- overallZ
llShuffle <- rep(NA, currentK)
for (i in zNonEmpty) {
ix <- overallZ == i
newZ <- overallZ
newZ[ix] <- zSecond[ix]
p <- .cCReDecomposeCounts(
counts,
s,
newZ,
previousZ,
nGByCP,
currentK)
nGByCP <- p$nGByCP
mCPByS <- p$mCPByS
llShuffle[i] <- .cCCalcLL(
mCPByS,
nGByCP,
s,
newZ,
currentK,
nS,
nG,
alpha,
beta)
previousZ <- newZ
}
# Remove the cluster which had the the largest likelihood after removal
zToRemove <- which.max(llShuffle)
ix <- overallZ == zToRemove
overallZ[ix] <- zSecond[ix]
p <- .cCReDecomposeCounts(counts,
s,
overallZ,
previousZ,
nGByCP,
currentK)
nGByCP <- p$nGByCP[, -zToRemove, drop = FALSE]
mCPByS <- p$mCPByS[-zToRemove, , drop = FALSE]
overallZ <- as.integer(as.factor(overallZ))
currentK <- currentK - 1
}
return(overallZ)
}
.initializeSplitY <- function(counts,
L,
LSubcluster = NULL,
tempK = 100,
beta = 1,
delta = 1,
gamma = 1,
minFeature = 3) {
if (is.null(LSubcluster)) {
LSubcluster <- ceiling(sqrt(L))
}
# Collapse cells to managable number of clusters
if (!is.null(tempK) && ncol(counts) > tempK) {
z <- .initializeSplitZ(counts, K = tempK)
counts <- .colSumByGroup(counts, z, length(unique(z)))
}
# Initialize the model with KSubcluster clusters
res <- .celda_G(counts,
L = LSubcluster,
maxIter = 10,
yInitialize = "random",
beta = beta,
delta = delta,
gamma = gamma,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE)
overallY <- as.integer(as.factor(celdaClusters(res)$y))
currentL <- max(overallY)
counter <- 0
while (currentL < L & counter < 25) {
# Determine which clusters are split-able
yTa <- tabulate(overallY, max(overallY))
yToSplit <- sample(which(yTa > minFeature & yTa > LSubcluster))
if (length(yToSplit) == 0) {
break
}
# Cycle through each splitable cluster and split it up into
# LSublcusters
for (i in yToSplit) {
# make sure the colSums of subset counts is not 0
countsY <- counts[overallY == i, , drop = FALSE]
countsY <- countsY[, !(colSums(countsY) == 0)]
if (ncol(countsY) == 0) {
next
}
clustLabel <- .celda_G(
countsY,
L = min(LSubcluster, nrow(countsY)),
yInitialize = "random",
beta = beta,
delta = delta,
gamma = gamma,
maxIter = 20,
splitOnIter = -1,
splitOnLast = FALSE,
verbose = FALSE,
reorder = FALSE
)
tempY <- as.integer(as.factor(celdaClusters(clustLabel)$y))
# Reassign clusters with label > 1
splitIx <- tempY > 1
ix <- overallY == i
newY <- overallY[ix]
newY[splitIx] <- currentL + tempY[splitIx] - 1
overallY[ix] <- newY
currentL <- max(overallY)
# Ensure that the maximum number of clusters does not get too large
if (currentL > L + 10) {
break
}
}
counter <- counter + 1
}
## Decompose counts for likelihood calculation
p <- .cGDecomposeCounts(counts = counts, y = overallY, L = currentL)
nTSByC <- p$nTSByC
nByG <- p$nByG
nByTS <- p$nByTS
nGByTS <- p$nGByTS
nM <- p$nM
nG <- p$nG
rm(p)
# Pre-compute lgamma values
lgbeta <- lgamma((seq(0, max(colSums(counts)))) + beta)
lggamma <- lgamma(seq(0, nrow(counts) + L) + gamma)
lgdelta <- c(NA, lgamma(seq(nrow(counts) + L) * delta))
# Remove clusters 1-by-1 until L is reached
while (currentL > L) {
# Find second best assignment give current assignments for each cell
probs <- .cGCalcGibbsProbY(
counts = counts,
y = overallY,
L = currentL,
nTSByC = nTSByC,
nByTS = nByTS,
nGByTS = nGByTS,
nByG = nByG,
nG = nG,
beta = beta,
delta = delta,
gamma = gamma,
lgbeta = lgbeta,
lggamma = lggamma,
lgdelta = lgdelta,
doSample = FALSE)
yProb <- t(probs$probs)
yProb[cbind(seq(nrow(yProb)), overallY)] <- NA
ySecond <- apply(yProb, 1, which.max)
yTa <- tabulate(overallY, currentL)
yNonEmpty <- which(yTa > 0)
# Find worst cluster by logLik to remove
previousY <- overallY
llShuffle <- rep(NA, currentL)
for (i in yNonEmpty) {
ix <- overallY == i
newY <- overallY
newY[ix] <- ySecond[ix]
# Move arounds counts for likelihood calculation
p <- .cGReDecomposeCounts(
counts,
newY,
previousY,
nTSByC,
nByG,
currentL)
nTSByC <- p$nTSByC
nGByTS <- p$nGByTS
nByTS <- p$nByTS
llShuffle[i] <- .cGCalcLL(
nTSByC,
nByTS,
nByG,
nGByTS,
nM,
nG,
currentL,
beta,
delta,
gamma)
previousY <- newY
}
# Remove the cluster which had the the largest likelihood after removal
yToRemove <- which.max(llShuffle)
ix <- overallY == yToRemove
overallY[ix] <- ySecond[ix]
# Move around counts and remove module
p <- .cGReDecomposeCounts(
counts,
overallY,
previousY,
nTSByC,
nByG,
currentL)
nTSByC <- p$nTSByC[-yToRemove, , drop = FALSE]
nGByTS <- p$nGByTS[-yToRemove]
nByTS <- p$nByTS[-yToRemove]
overallY <- as.integer(as.factor(overallY))
currentL <- currentL - 1
}
return(overallY)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.