# @title Handle directory creation
#
# @description Given the output directory name with its complete path, this
# function checks if a directory of same name exists the given location. If
# yes, then it adds a suffix (a number) to the given directory name, and
# proceeds to create the directory.
#
# @param o_dir Specify the output directory name with its complete path.
#
# @param vrbs Set verbosity to TRUE or FALSE
#
# @return The (updated) dir name
#
#
handle_dir_creation <- function(o_dir, vrbs){
##
cli::cli_alert_info(c("Output directory at path: ",
"{.emph {dirname(o_dir)}}"))
if(dir.exists(o_dir)){
cli::cli_alert_warning("Directory exists: {.emph {basename(o_dir)}}")
allExistingDirs <- list.dirs(path = dirname(o_dir),
recursive = FALSE)
dirsThatMatch <- grep(pattern = basename(o_dir), allExistingDirs,
value = TRUE)
## Suffix an integer, because directory with given name (oDir)
## exists for length(dirsThatMatch) times
name_suffix <- length(dirsThatMatch)
o_dir <- paste(o_dir, name_suffix, sep = "_")
while(dir.exists(o_dir)){
name_suffix <- name_suffix + 1
o_dir <- paste(o_dir, name_suffix, sep = "_")
}
}
retVal <- dir.create(o_dir, showWarnings = TRUE)
stopifnot(retVal)
cli::cli_alert_success("Writing to: {.emph {basename(o_dir)}}")
o_dir
}
## =============================================================================
.check_list_prop <- function(listVar){
returnVal <- .assert_seqArchR_list_properties(listVar)
if (returnVal != "FOO") stop(returnVal)
}
## =============================================================================
## Getter function to fetch the features matrix from NMF result object
## (from python)
## Dependency on python script perform_nmf.py
get_features_matrix <- function(nmfResultObj){
.check_list_prop(nmfResultObj)
return(as.matrix(nmfResultObj[[1]]))
}
## =============================================================================
## Getter function to fetch the samples matrix from NMF result object
## (from python)
## Dependency on python script perform_nmf.py
get_samples_matrix <- function(nmfResultObj){
.check_list_prop(nmfResultObj)
return(as.matrix(nmfResultObj[[2]]))
}
## =============================================================================
get_trimers_from_alphabet <- function(alph){
if (is.null(alph)) stop("Expecting non-NULL alphabet")
return(do.call(paste0, expand.grid(alph, alph, alph)))
}
## =============================================================================
get_dimers_from_alphabet <- function(alph){
if (is.null(alph)) stop("Expecting non-NULL alphabet")
return(do.call(paste0, expand.grid(alph, alph)))
}
## =============================================================================
manage_o_dir <- function(plt, o_dir){
if(plt){
if(is.null(o_dir)){
stop("'plot' flag is TRUE but 'o_dir' is not provided. ",
"Did you forget to set 'o_dir'?")
}
}
}
## =============================================================================
# @importFrom Biostrings width
plot_all_seqs_logo <- function(seqs_raw, seqs_pos, dpath){
if(is.null(seqs_raw)) stop("seqs_raw is NULL")
if(is.null(dpath)) stop("directory path/name is NULL")
if(is.null(seqs_pos)) seqs_pos <- seq(1, Biostrings::width(seqs_raw[1]))
allSequencesLogo <- plot_ggseqlogo_of_seqs(
seqs = seqs_raw,
pos_lab = seqs_pos,
title = paste("Sequence logo of all", length(seqs_raw),"sequences" ))
##
ggsave(filename = file.path(dpath, "allSequencesLogo.pdf"),
plot = allSequencesLogo,
device = "pdf", width = 20, height = 2.5)
}
## =============================================================================
#' @title
#' Set seqArchR run configuration
#'
#' @description This function sets the configuration for `seqArchR`.
#'
#' @param chunk_size Numeric. Specify the size of the inner chunks of
#' sequences.
#' @param k_min Numeric. Specify the minimum of the range of values to be tested
#' for number of NMF basis vectors. Default is 1.
#' @param k_max Numeric. Specify the maximum of the range of values to be tested
#' for number of NMF basis vectors. Default is 50.
#' @param mod_sel_type Character. Specify the model selection strategy to
#' be used. Default is 'stability'. Another option is 'cv', short for
#' cross-validation. Warning: The cross-validation approach can be time
#' consuming and computationally expensive than the stability-based approach.
#' @param bound Numeric. Specify the lower bound value as criterion for choosing
#' the most appropriate number of NMF factors. Default is 1e-08.
#' @param cv_folds Numeric. Specify the number of cross-validation folds used
#' for model selection. Only used when mod_sel_type is set to 'cv'. Default
#' value is 5.
#' @param parallelize Logical. Specify whether to parallelize the procedure.
#' Note that running seqArchR serially can be time consuming, especially when
#' using cross-validation for model selection. See `n_cores`.
#' Consider parallelizing with at least 2 or 4 cores.
#' @param n_cores The number of cores to be used when `parallelize` is set
#' to TRUE. If `parallelize` is FALSE, nCores is ignored.
#' @param n_runs Numeric. Specify the number of bootstrapped runs
#' to be performed with NMF. Default value is 100. When using cross-validation
#' more than 100 iterations may be needed (upto 500).
#' @param alpha_base,alpha_pow Specify the base and the power for computing
#' 'alpha' in performing model selection for NMF. alpha = alpha_base^alpha_pow.
#' Alpha specifies the regularization for NMF. Default: 0 and 1 respectively.
#' _Warning_: Currently, not used (for future).
#' @param min_size Numeric. Specify the minimum number of sequences, such that
#' any cluster/chunk of size less than or equal to it will not be further
#' processed. Default is 25.
#' @param result_aggl Character. Specify the agglomeration method to be used
#' for final result collation with hierarchical clustering. Default is
#' 'complete' linkage. Possible values are those allowed with
#' \code{\link[stats]{hclust}}. Also see details below.
#' @param result_dist Character. Specify the distance method to be used for
#' final result collation with hierarchical clustering. Default is 'cor' for
#' correlation. Possible values are those allowed with
#' \code{\link[stats]{hclust}}. Also see details below.
#' @param checkpointing Logical. Specify whether to write intermediate
#' checkpoints to disk as RDS files. Checkpoints and the final result are
#' saved to disk provided the `o_dir` argument is set in \code{\link{seqArchR}}.
#' When `o_dir` argument is not provided or NULL, this is ignored.
#' Default is TRUE.
#' @param flags List with four logical elements as detailed.
#' \describe{
#' \item{debug}{Whether debug information for the run is printed}
#' \item{verbose}{Whether verbose information for the run is printed}
#' \item{plot}{Whether verbose plotting is performed for the run}
#' \item{time}{Whether timing information is printed for the run}
#' }
#'
#' @details Setting suitable values for the following parameters is dependent
#' on the data: 'inner_chunk_size', 'k_min', 'k_max', 'mod_sel_type',
#' 'min_size', 'result_aggl', 'result_dist'.
#'
#'
#' @return A list with all params for seqArchR set
#'
#' @examples
#' # Set seqArchR configuration
#' seqArchRconfig <- seqArchR::set_config(
#' chunk_size = 100,
#' parallelize = TRUE,
#' n_cores = 2,
#' n_runs = 100,
#' k_min = 1,
#' k_max = 20,
#' mod_sel_type = "stability",
#' bound = 10^-8,
#' flags = list(debug = FALSE, time = TRUE, verbose = TRUE,
#' plot = FALSE)
#' )
#'
#'
#' @export
set_config <- function(chunk_size = 500,
k_min = 1,
k_max = 50,
mod_sel_type = "stability",
bound = 10^-6,
cv_folds = 5,
parallelize = FALSE,
n_cores = NA,
n_runs = 100,
alpha_base = 0,
alpha_pow = 1,
min_size = 25,
result_aggl = "complete",
result_dist = "euclid",
checkpointing = TRUE,
flags = list(
debug = FALSE,
time = FALSE,
verbose = TRUE,
plot = FALSE)
) {
## Configuration Params that can be set by user
seqArchRconfig <- NULL
##
if(is.null(flags)){
useFlags <- list(
debugFlag = FALSE,
timeFlag = FALSE,
verboseFlag = TRUE,
plotVerboseFlag = FALSE)
}else{
useFlags <- list(
debugFlag = flags$debug,
timeFlag = flags$time,
verboseFlag = flags$verbose,
plotVerboseFlag = flags$plot)
}
##
seqArchRconfig <- list(modSelType = mod_sel_type,
bound = bound,
kFolds = cv_folds,
parallelize = parallelize,
nCoresUse = n_cores,
nRunsUse = n_runs,
paramRanges = list(
alphaBase = alpha_base,
alphaPow = alpha_pow,
k_vals = seq(k_min, k_max, by = 1)
),
chunkSize = chunk_size,
result_aggl = result_aggl,
result_dist = result_dist,
checkpointing = checkpointing,
minSeqs = min_size,
flags = useFlags
)
.assert_seqArchR_config(seqArchRconfig)
return(seqArchRconfig)
}
## =============================================================================
## @title Decide processing of outer chunk based on its size
##
## @description Function to make the decision on whether the given (outer)
## chunk should be processed
##
## @param minThreshold Numeric. This is the minSeqs param from seqArchR config
## @param lengthOfOC Numeric. This is the length of the outer chunk
## @param kFoldsVal Numeric. This is the kFolds param in seqArchR config
##
## If the lengthOfOC == 0, STOP
## If the minThreshold < 4*kFolds, STOP
## If the lengthOfOC < minThreshold, DO NOT PROCESS/return TRUE
##
.decide_process_outer_chunk <- function(minThreshold, lengthOfOC, kFoldsVal) {
stopifnot(lengthOfOC > 0)
stopifnot(minThreshold > 0)
## Assert that minThreshold > 4*kFoldsVal
nFoldsCondition <- 4 * kFoldsVal
.assert_seqArchR_min_size_independent(minThreshold)
if (minThreshold < nFoldsCondition) {
stop("'min_size' should be at least 4 times 'kFolds'")
}
##
doNotProcess <- FALSE
if (lengthOfOC <= minThreshold) {
doNotProcess <- TRUE
.msg_pstr("Sorry, will not process this small a chunk: ",
lengthOfOC, flg=TRUE)
}
##
return(doNotProcess)
}
## =============================================================================
## @param factorsMat A matrix holding the factors along the columns
## @param distMethod character A string specifying the distance measure to
## be computed. Values are: 'modNW' for modified Needleman Wunsch, and any
## distance measure that is possible with HOPACH
##
## Default value 'modNW'
## Edit on 2021-01-02:
## Default value changed to euclid instead of modNW which is computed using
## a suggested package
## Edit on 2022-04-01:
## modNW removed. May not be required at all
##
## @return distance matrix from hopach (hdist object)
.compute_factor_distances <- function(factorsMat, distMethod = "euclid"){
## Assumption: Each column is a factor
.assert_seqArchR_featSampMatrix(factorsMat, feat = TRUE)
## Since the default distMethod is euclid/euclidean, when the user wishes
## to use any other distance methods/metrics, we can check if hopach exists
## If not, we ask the user to install it.
## This lets move hopach to Suggests
distMethods_hopach <- c("cosangle", "abscosangle",
"abseuclid", "cor", "abscor")
distMethods_stats <- c("euclid")
##
if(any(distMethod == distMethods_hopach)){
distMat <- .get_hopach_dist(factorsMat, distMethod)
return(distMat)
}
if(distMethod == distMethods_stats){
distMat <- .get_stats_dist(factorsMat)
return(distMat)
}
}
## =============================================================================
.get_hopach_dist <- function(factorsMat, distMethod){
if(!requireNamespace("hopach", quietly = TRUE)){
stop("Please install R package 'hopach' to use ", distMethod,
" distance.")
}else{
## - these distance metrics are available in hopach pkg
## - hopach::distancematrix func requires vectors along rows.
## - Distances are computed between row vectors
if (nrow(factorsMat) > ncol(factorsMat)){
factorsMat <- t(factorsMat)
}
hopachDist <- hopach::distancematrix(factorsMat, d = distMethod)
## hopachDistMat is a hopach hdist object
stopifnot(length(hopachDist) == nrow(factorsMat))
## make as.matrix as done for dist object in the
## stats::dist case (see Else condition next)
hopachDistMat <- hopach::as.matrix(hopachDist)
return(hopachDistMat)
}
}
## =============================================================================
.get_stats_dist <- function(factorsMat){
## dist method from stats // standard
if (nrow(factorsMat) > ncol(factorsMat)) factorsMat <- t(factorsMat)
as_dist <- stats::dist(factorsMat, method = "euclidean")
distMat <- as.matrix(as_dist)
return(distMat)
}
## For hierarchical clustering object, return the cluster medoids
## We currently use the first element of the cluster as its medoid
.get_factors_from_factor_clustering2 <- function(listObj, globFactorsMat){
##
.assert_seqArchR_featSampMatrix(globFactorsMat, feat = TRUE)
if (is.null(listObj)) {
return(globFactorsMat)
} else {
medoids <- unlist(lapply(listObj, function(x){x[1]}))
return(as.matrix(globFactorsMat[ , medoids]))
}
}
## =============================================================================
## @title Process a chunk wih NMF
##
##
## On the given inner chunk,
## 1. perform model selection for #factors for NMF
## 2. Perform final NMF with chosen best_k (#Factors)
## 3. Store factors (globFactors)
## 4. Fetch clusters using k-means clustering
## - assign clusters <--> factors
## 5. Store cluster assignments (globClustAssignments)
## 6. Return updated globFactors, globClustAssignments
##
## - innerChunkIdx is needed to appropriately index into the global variables:
## globFactors and globClustAssignments
## - globClustAssignments variable is updated inside the function to
## additionally hold new ones
## - Similarly, globFactors variable is updated inside the function to
## additionally hold new ones
##
##
.handle_chunk_w_NMF2 <- function(innerChunkIdx,
innerChunksColl,
this_mat,
cgfglinear = TRUE,
coarse_step = 10,
askParsimony = TRUE,
config, test_itr, oChunkIdx,
bpparam){
.assert_seqArchR_flags(config$flags)
dbg <- config$flags$debugFlag
vrbs <- config$flags$verboseFlag
tym <- config$flags$timeFlag
##
if (is.null(this_mat) || !is.matrix(this_mat) &&
!is(this_mat, "dgCMatrix")) {
stop("Input matrix to model selection procedure is NULL or not a
matrix")
}
#########################
if(config$modSelType == "cv"){
suffix_str <- ", without parsimony"
if(askParsimony) suffix_str <- ", with parsimony"
.msg_pstr("Performing cross validation-based model selection",
suffix_str, flg=dbg)
best_k <- .cv_model_select_pyNMF2(
X = this_mat, param_ranges = config$paramRanges,
kFolds = config$kFolds, nRuns = config$nRunsUse,
verboseFlag = config$flags$verboseFlag,
debugFlag = config$flags$debugFlag,
returnBestK = TRUE, cgfglinear = cgfglinear,
coarse_step = coarse_step,
askParsimony = askParsimony, bpparam = bpparam
)
}
#########################
if(config$modSelType == "stability"){
.msg_pstr("Performing stability-based model selection", flg=dbg)
best_k <- .stability_model_select_pyNMF2(
X = this_mat, param_ranges = config$paramRanges,
nRuns = config$nRunsUse, bound = config$bound,
flags = config$flags, returnBestK = TRUE, bootstrap = TRUE,
bpparam = bpparam
)
}
#########################
if (best_k == max(config$paramRanges$k_vals)) {
warning(c("Best K for this subset == 'k_max'. ",
"Consider selecting a larger 'k_max' value, or\n",
"smaller chunk_size, or\n",
"perhaps, further increasing 'n_runs'\n"),
immediate. = TRUE)
}
cli::cli_alert_info("Best K for this chunk: {best_k}")
##
if (best_k >= 1) {
## For fetching sequence clusters from samplesMat
## Cluster sequences
## New strategy, perform nRuns for bestK and use only the best one
nRuns <- config$nRunsUse
featuresMatrixList <- vector("list", nRuns)
samplesMatrixList <- vector("list", nRuns)
new_ord <- vector("list", nRuns)
nmf_nRuns_list <- .perform_multiple_NMF_runs(X = this_mat,
kVal = best_k,
alphaVal = 0,
nRuns = nRuns,
bootstrap = TRUE, bpparam = bpparam)
featuresMatrixList <- lapply(nmf_nRuns_list$nmf_result_list,
get_features_matrix)
samplesMatrixList <- lapply(nmf_nRuns_list$nmf_result_list,
get_samples_matrix)
##
new_ord <- nmf_nRuns_list$new_ord
## Get reconstruction accuracies (q2) for them
## assume this_Mat is A, recA is reconstructedA
recA_list <- lapply(seq_len(nRuns), function(nR){
as.matrix(featuresMatrixList[[nR]]) %*%
as.matrix(samplesMatrixList[[nR]])
})
q2_nRuns <- unlist(lapply(recA_list, function(recA){
.compute_q2(as.matrix(this_mat), recA)
}))
bestQ2_run <- which.max(q2_nRuns)
bestFeatMat <- featuresMatrixList[[bestQ2_run]]
bestSampMat <- samplesMatrixList[[bestQ2_run]]
bestOrd <- new_ord[[bestQ2_run]]
##
featuresMatrix <- bestFeatMat
samplesMatrix <- bestSampMat
## When order was changing:
## put samples matrix back in order it should be
tempM <- bestSampMat
samplesMatrix <- matrix(rep(NA, length(tempM)), nrow = nrow(tempM))
samplesMatrix[ ,bestOrd] <- tempM
#####
clusterMembershipsForSamples <-
.get_cluster_memberships_per_run(samplesMatrix = samplesMatrix,
iChunksColl = innerChunksColl, iChunkIdx = innerChunkIdx,
test_itr, oChunkIdx)
##
## Could handle overfitting here
if(best_k > 1){
has_overfit <- .detect_overfitting(samplesMatrix,
clusterMembershipsForSamples,
minSeqs = 50)
##
## -- Note which clusters are overfit
## -- Remove those columns from featuresMat
## -- Adjust clustMemberships
if(length(has_overfit) > 0){
clusterMembershipsForSamples <-
.adjustSampleMemberships(clusterMembershipsForSamples,
samplesMatrix, has_overfit)
featuresMatrix <- as.matrix(featuresMatrix[, -c(has_overfit)])
best_k <- best_k - length(has_overfit)
cli::cli_alert_info(c("Adjusting for overfitting, ",
"fetched {best_k} cluster{?s}"))
stopifnot(best_k == ncol(featuresMatrix))
stopifnot(best_k ==
length(unique(clusterMembershipsForSamples)))
if(best_k == 1){
stopifnot(unique(clusterMembershipsForSamples) == 1)
}
}
}
##
forGlobClustAssignments <- .assign_samples_to_clusters(
clusterMembershipsVec = clusterMembershipsForSamples,
nClusters = best_k, iChunkIdx = innerChunkIdx,
iChunksColl = innerChunksColl)
##
} else if (best_k < 1) {
stop("Chosen number of factors: ", best_k)
}
.assert_seqArchR_featSampMatrix(featuresMatrix, feat = TRUE)
.assert_seqArchR_globClustAssignments(forGlobClustAssignments)
innerChunkNMFResult <- list(forGlobFactors = featuresMatrix,
forGlobClustAssignments =
forGlobClustAssignments)
##
return(innerChunkNMFResult)
}
## =============================================================================
# .getMeanOfListOfMatrices <- function(listOfMats) {
# # Currently, assume all matrices have same dimensions
# # if discrepancy in dimensions, throws error "non-conformable arrays"
# meanMat <- Reduce("+", listOfMats)/length(listOfMats)
# return(meanMat)
# }
## @title Setup the clustFactors list element for seqArchR result object
##
## @description Function to set up the clustFactors variable for seqArchR result
## object. Having a separate dedicated function enables seamless future changes.
##
## @param intClustFactors A matrix holding all factors from the just concluded
## iteration along the columns.
##
## @return A list with 2 elements having fixed element names. They are
## 'nBasisVectors': This is the number of basis vectors (for easy info access)
## and
## 'basisVectors': This is the matrix of basis vectors (intClustFactors itself)
.setup_clustFactors_for_seqArchR_result <- function(intClustFactors) {
returnList <- list(nBasisVectors = ncol(intClustFactors),
basisVectors = intClustFactors)
return(returnList)
}
## =============================================================================
intermediateResultsPlot <- function(seq_lab, seqs_raw = NULL,
pos_lab = NULL, iter = 0, fname = NULL,
name_suffix = NULL,
vrbs = TRUE){
## This function plots and prints resulting clusters -- the sequence image
## matrix (PNG file) and the sequence logos (PDF file).
if(is.null(name_suffix)){
if(is.numeric(iter)){
name_suffix <- paste0("Iteration", iter)
cli::cli_h2("Intermediate result")
}else{
name_suffix <- paste0("Final")
cli::cli_rule(left="Final result")
}
}else{
.msg_pstr("=== On-demand Result ===", flg=vrbs)
}
##
cli::cli_alert_info("Output directory: {.emph {fname}}")
seqs_clust_list_ord <- get_seqs_clust_list(seq_lab)
seqs_clust_vec_ord <- unlist(seqs_clust_list_ord)
image_fname <- file.path(fname,
paste0("ClusteringImage_", name_suffix, ".png"))
cli::cli_alert_info(c("Sequence clustering image written to: ",
"{.emph {basename(image_fname)}}"))
viz_seqs_acgt_mat(
seqs = as.character(seqs_raw[seqs_clust_vec_ord]),
pos_lab = pos_lab,
save_fname = image_fname,
f_width = 450,
f_height = 900,
xt_freq = 5,
yt_freq = 100)
arch_fn <- file.path(fname,
paste0("Architecture_SequenceLogos_", name_suffix, ".pdf"))
cli::cli_alert_info(c("Architectures written to: ",
"{.emph {basename(arch_fn)}}"))
plot_arch_for_clusters(seqs = seqs_raw,
clust_list = seqs_clust_list_ord,
pos_lab = pos_lab,
pdf_name = arch_fn)
}
## =============================================================================
save_final_result <- function(o_dir, temp_seqArchRresult){
if(!is.null(o_dir)){
rdsFilename <- file.path(o_dir, "seqArchRresult.rds")
saveRDS(temp_seqArchRresult, file=rdsFilename)
cli::cli_alert_info("Result saved to: {.emph {basename(rdsFilename)}}")
}
}
## =============================================================================
save_checkpoint <- function(o_dir, test_itr, threshold_itr,
seqsClustLabelsList, clustFactors,
seqs_raw, config, call = NULL){
##
if(!is.null(o_dir) && test_itr != threshold_itr){
##
curr_seqArchRresult <- list(seqsClustLabels = seqsClustLabelsList,
clustBasisVectors = clustFactors,
rawSeqs = seqs_raw,
config = config,
call = call)
rdsFilename <- file.path(o_dir, paste0("seqArchRresult_checkpoint",
test_itr, ".rds"))
saveRDS(curr_seqArchRresult, file=rdsFilename)
cli::cli_alert_info(c("Checkpointing at iteration {test_itr}: ",
"{basename(rdsFilename)}"))
}
}
## =============================================================================
show_ellapsed_time <- function(use_str = "Time ellapsed since start: ",
use_time){
complTime1 <- as.difftime(Sys.time() - use_time)
cli::cli_alert(
c(use_str, "{prettyunits::pretty_dt(complTime1)}"))
as.double.difftime(complTime1)
}
## =============================================================================
decisionToCollate <- function(clustFactors, dbg){
decisionToCollate <- TRUE
iterations <- length(clustFactors)
if(iterations > 1){
prevIterNFactors <- clustFactors[[iterations-1]]$nBasisVectors
currIterNFactors <- clustFactors[[iterations]]$nBasisVectors
if(currIterNFactors == prevIterNFactors){
decisionToCollate <- FALSE
.msg_pstr("Reordering decision: FALSE", flg=dbg)
}
}
decisionToCollate
}
## =============================================================================
keepMinClusters <- function(set_ocollation, temp_res = NULL,
totOuterChunksColl = NULL, test_itr = 1,
nClustEachOC = NULL, nClustEachIC = NULL,
dbg, clustFactors = NULL, stage = "Final"){
if(!is.null(stage) && stage == "Final" && test_itr > 1){
setMinClustersFinal <- 2 ## the default value
## For setting minClusters, note last iteration collated
if(any(set_ocollation)){
lastItrC <- tail(which(set_ocollation), 1)
setMinClustersFinal <- get_clBasVec_k(temp_res, lastItrC)
}
return(setMinClustersFinal)
}
if(totOuterChunksColl > 1){
.msg_pstr("meanClustersOC: ", ceiling(mean(nClustEachOC)), flg=dbg)
## For setting minClusters, note last iteration collated
chkIdx <- seq_len(test_itr-1)
if(any(set_ocollation[chkIdx])){
lastItrC <- tail(which(set_ocollation[chkIdx]), 1)
setMinClusters <- clustFactors[[lastItrC]]$nBasisVectors
}else{
## average clusters identified in each chunk of the 1st iter
setMinClusters <- max(ceiling(mean(nClustEachOC[1])), 2)
}
}else{
## When totOuterChunks is == 1, this is the first iteration
## Use the mean of nClustEachIC
.msg_pstr("meanClustersIC: ", ceiling(mean(nClustEachIC)), flg=dbg)
setMinClusters <- max(ceiling(mean(nClustEachIC)), 2)
}
return(setMinClusters)
}
## =============================================================================
perform_setup <- function(config, total_itr, o_dir, fresh,
seqs_pos, seqs_raw, seqs_ohe_mat,
set_ocollation){
dbg <- config$flags$debugFlag
vrbs <- config$flags$verboseFlag
plt <- config$flags$plotVerboseFlag
crs <- config$nCoresUse
parallelize <- config$parallelize
modSelType <- config$modSelType
bound <- config$bound
## assert total_itr is a positive integer
.assert_seqArchR_thresholdIteration(total_itr)
## TODO Provide a summary function
if(!is.null(o_dir)){
if(fresh){
o_dir <- handle_dir_creation(o_dir, vrbs||dbg)
}else{
if(!dir.exists(o_dir)){
stop(o_dir, " not found")
}
}
}
##
if(is.null(seqs_pos)){
seqs_pos <- seq_len(Biostrings::width(seqs_raw[1]))
}
##
if(plt){
manage_o_dir(plt, o_dir) # this will stop if o_dir is NULL
plot_all_seqs_logo(seqs_raw, seqs_pos, dpath=o_dir)
}
## Make checks for params in configuration
.assert_seqArchR_config(config, seqs_size = ncol(seqs_ohe_mat))
.assert_seqArchR_thresholdIteration(total_itr)
.assert_seqArchR_collation(set_ocollation, total_itr)
if(length(set_ocollation) > total_itr){
set_ocollation <- set_ocollation[seq(1,total_itr)]
cli::cli_alert_info(c("Changing length of 'set_ocollation' to ",
"same as that of total_itr"))
}
#### Start cluster only once -- using BiocParallel
if(parallelize){
if(.Platform$OS.type == "windows"){
if (is.null(crs)) crs <- BiocParallel::multicoreWorkers()
cl <- BiocParallel::SnowParam(workers = crs, tasks = crs,
exportglobals = TRUE)
cli::cli_alert_info("Parallelization: Yes")
}else{
if (is.null(crs)) crs <- BiocParallel::multicoreWorkers()
cl <- BiocParallel::MulticoreParam(workers = crs, tasks = crs)
cli::cli_alert_info("Parallelization: Yes")
}
}else{
cl <- BiocParallel::SerialParam()
cli::cli_alert_info("Parallelization: No")
}
##
if(modSelType != "cv" && modSelType != "stability")
cli::cli_alert_warning(c("Mis-specified model selection strategy",
"Using factor stability for model selection"))
msg_suffix <- ifelse(modSelType == "cv", "cross-validation",
"factor stability")
cli::cli_alert_info("Model selection by {msg_suffix}")
if(modSelType == "stability") cli::cli_alert_info("Bound: {bound}")
set_parsimony <- NULL
if(modSelType == "cv"){
## Specify set_parsimony when cv
set_parsimony <- rep(TRUE, total_itr)
}
return(list(cl = cl, o_dir = o_dir, seqs_pos = seqs_pos,
set_parsimony = set_parsimony))
}
## =============================================================================
process_innerChunk <- function(test_itr, innerChunksColl, config, lenOC,
seqs_ohe_mat, set_parsimony, outerChunkIdx,
bpparam){
nmfResultEachIC <- lapply(seq_along(innerChunksColl), function(x){
innerChunkIdx <- x
cli::cli_h3(c("Inner chunk {innerChunkIdx} of ",
"{length(innerChunksColl)} ",
"[Size: {length(innerChunksColl[[innerChunkIdx]])}]"))
##
## Setting up sequences for the current chunk
this_seqsMat <-
seqs_ohe_mat[, innerChunksColl[[innerChunkIdx]]]
##
chnksz <- config$chunkSize
cvStep <- ifelse(test_itr == 1 || lenOC > 0.9*chnksz, 10, 5)
thisNMFResult <- .handle_chunk_w_NMF2(innerChunkIdx,
innerChunksColl, this_seqsMat,
cgfglinear = TRUE, coarse_step = cvStep,
askParsimony = set_parsimony[test_itr],
config = config, test_itr = test_itr,
oChunkIdx = outerChunkIdx, bpparam = bpparam)
thisNMFResult
})
assertions <- lapply(nmfResultEachIC, .assert_seqArchR_NMFresult)
globFactors <- lapply(nmfResultEachIC, function(x){x$forGlobFactors})
globClustAssignments <- lapply(nmfResultEachIC, function(x){
x$forGlobClustAssignments})
nClustEachIC <- unlist(lapply(globClustAssignments, length))
return(list(globFactors = globFactors,
globClustAssignments = globClustAssignments,
nClustEachIC = nClustEachIC))
}
## =============================================================================
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.