#' Training scClassify model
#'
#' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset
#' @param cellTypes_train A vector of cell types of reference dataset
#' @param tree A vector indicates the method to build hierarchical tree,
#' set as "HOPACH" by default.
#' This should be one of "HOPACH" and "HC" (using stats::hclust).
#' @param selectFeatures A vector indicates the gene selection method,
#' set as "limma" by default.
#' This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo".
#' @param topN An integer indicates the top number of features that are selected
#' @param hopach_kmax An integer between 1 and 9 specifying the maximum number of
#' children at each node in the HOPACH tree.
#' @param pSig A numeric indicates the cutoff of pvalue for features
#' @param cellType_tree A list indicates the cell type tree provided by user.
#' (By default, it is NULL)
#' @param weightsCal A logical input indicates whether we need to
#' calculate the weights for the model.
#' @param parallel A logical input indicates whether the algorihms will run in parallel
#' @param BPPARAM A \code{BiocParallelParam} class object
#' from the \code{BiocParallel} package is used. Default is SerialParam().
#' @param verbose A logical input indicates whether the intermediate steps will be printed
#' @param returnList A logical input indicates whether the output will be class of list
#' @param ... Other input for predict_scClassify for the case when weights calculation
#' of the pretrained model is performed
#' @return list of results or an object of \code{scClassifyTrainModel}
#' @author Yingxin Lin
#'
#' @examples
#' data("scClassify_example")
#' xin_cellTypes <- scClassify_example$xin_cellTypes
#' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset
#' trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset,
#' cellTypes_train = xin_cellTypes,
#' selectFeatures = c("limma", "BI"),
#' returnList = FALSE
#' )
#'
#' @importFrom stats na.omit
#' @importFrom methods is
#' @importFrom BiocParallel SerialParam
#' @export
train_scClassify <- function(exprsMat_train,
cellTypes_train,
tree = "HOPACH",
selectFeatures = "limma",
topN = 50,
hopach_kmax = 5,
pSig = 0.05,
cellType_tree = NULL,
weightsCal = FALSE,
parallel= FALSE,
BPPARAM = BiocParallel::SerialParam(),
verbose= TRUE,
returnList = TRUE,
...){
if (is.null(exprsMat_train) | is.null(cellTypes_train)) {
stop("exprsMat_train or cellTypes_train or exprsMat_test is NULL!")
}
# Matching the argument of the tree construction method
tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE)
# Matching the argument of feature selection method
selectFeatures <- match.arg(selectFeatures,
c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
several.ok = TRUE)
if ("list" %in% is(exprsMat_train)) {
if (sum(unlist(lapply(cellTypes_train, length)) !=
unlist(lapply(exprsMat_train, ncol))) != 0) {
stop("Length of training cell types does not match with
number of column of training expression matrix")
}
}else {
if (length(cellTypes_train) != ncol(exprsMat_train)) {
stop("Length of training cell types does not match with
number of column of training expression matrix")
}
}
# To rename the train list if name is null (only when there are multiple training datasets)
if ( "list" %in% is(exprsMat_train)) {
if (is.null(names(exprsMat_train))) {
names(exprsMat_train) <- names(cellTypes_train) <-
paste("TrainData",
seq_len(length(exprsMat_train)),
sep = "_")
} else if (sum(names(exprsMat_train) == "") != 0) {
names(exprsMat_train)[names(exprsMat_train) == ""] <-
names(cellTypes_train)[names(cellTypes_train) == ""] <-
paste("TrainData", which(names(exprsMat_train) == ""), sep = "_")
}
}
# QC for the training data set
if (any(c("matrix", "dgCMatrix") %in% is(exprsMat_train))) {
zeros <- apply(exprsMat_train, 1, function(x) sum(x == 0)/length(x))
minPctCell <- min(table(cellTypes_train)/length(cellTypes_train))
exprsMat_train <- exprsMat_train[zeros <= max(1 - minPctCell, 0.95), ]
if (verbose) {
cat("after filtering not expressed genes \n")
print(dim(exprsMat_train))
}
} else {
for (train_list_idx in seq_len(length(exprsMat_train))) {
zeros <- apply(exprsMat_train[[train_list_idx]], 1,
function(x) sum(x == 0)/length(x))
minPctCell <- min(table(cellTypes_train[[train_list_idx]])/length(cellTypes_train[[train_list_idx]]))
exprsMat_train[[train_list_idx]] <- exprsMat_train[[train_list_idx]][zeros <= max(1 - minPctCell, 0.95), ]
}
if (verbose) {
cat("after filtering not expressed genes \n")
print(lapply(exprsMat_train, dim))
}
}
### train_scClassify
if ("list" %in% is(exprsMat_train)) {
trainRes <- list()
for (train_list_idx in seq_len(length(exprsMat_train))) {
trainRes[[train_list_idx]] <- train_scClassifySingle(exprsMat_train[[train_list_idx]],
cellTypes_train[[train_list_idx]],
tree = tree,
selectFeatures = selectFeatures,
topN = topN,
hopach_kmax = hopach_kmax,
pSig = pSig,
weightsCal = weightsCal,
parallel = parallel,
BPPARAM = BPPARAM,
verbose = verbose,
...)
}
names(trainRes) <- names(exprsMat_train)
} else{
trainRes <- train_scClassifySingle(exprsMat_train,
cellTypes_train,
tree = tree,
selectFeatures = selectFeatures,
topN = topN,
hopach_kmax = hopach_kmax,
pSig = pSig,
cellType_tree = cellType_tree,
weightsCal = weightsCal,
parallel = parallel,
BPPARAM = BPPARAM,
verbose = verbose,
...)
}
# return the results
if (returnList) {
return(trainRes)
} else {
if ("list" %in% is(exprsMat_train)) {
trainClassList <- list()
for (train_list_idx in seq_len(length(trainRes))) {
trainClassList[[train_list_idx]] <- .scClassifyTrainModel(
name = names(trainRes)[train_list_idx],
cellTypeTree = trainRes[[train_list_idx]]$cutree_list,
cellTypeTrain = as.character(trainRes[[train_list_idx]]$cellTypes_train),
features = names(trainRes[[train_list_idx]]$hierarchyKNNRes),
model = trainRes[[train_list_idx]]$hierarchyKNNRes,
modelweights = trainRes[[train_list_idx]]$modelweights,
metaData = S4Vectors::DataFrame())
}
trainClassList <- scClassifyTrainModelList(trainClassList)
} else {
trainClassList <- .scClassifyTrainModel(
name = "training",
cellTypeTree = trainRes$cutree_list,
cellTypeTrain = as.character(trainRes$cellTypes_train),
features = names(trainRes$hierarchyKNNRes),
model = trainRes$hierarchyKNNRes,
modelweights = trainRes$modelweights,
metaData = S4Vectors::DataFrame())
}
return(trainClassList)
}
}
#' @importFrom BiocParallel SerialParam bplapply
train_scClassifySingle <- function(exprsMat_train,
cellTypes_train,
tree = "HOPACH",
selectFeatures = "limma",
topN = 50,
hopach_kmax = 5,
pSig = 0.05,
cellType_tree = NULL,
weightsCal = FALSE,
parallel= FALSE,
BPPARAM = BiocParallel::SerialParam(),
verbose= TRUE,
...){
if (is.null(rownames(exprsMat_train))) {
stop("rownames of the exprsMat_train is NULL!")
}
if (is.null(rownames(exprsMat_train)) |
sum(duplicated(colnames(exprsMat_train))) != 0) {
stop("colnames of exprsMat_train is NULL or not unique")
}
if (length(cellTypes_train) != ncol(exprsMat_train)) {
stop("Length of training cell types does not match with
number of column of training expression matrix")
}
if (all(exprsMat_train %% 1 == 0)) {
warning("exprsMat_train looks like a count matrix
(scClassify requires a log-transformed normalised input)")
}
# Matching the argument of the tree construction method
tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE)
# Matching the argument of feature selection method
selectFeatures <- match.arg(selectFeatures,
c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
several.ok = TRUE)
if (verbose) {
print("Feature Selection...")
}
# Select the features to construct tree
tt <- doLimma(exprsMat_train, cellTypes_train)
de <- Reduce(union, lapply(tt, function(t)
rownames(t)[seq_len(max(min(50, sum(t$adj.P.Val < 0.001)), 30))]))
de <- na.omit(de)
if (verbose) {
print(paste("Number of genes selected to construct HOPACH tree",
length(de)))
}
if (is.null(cellType_tree)) {
# Calculate the centroid matrix for tree
# construction using selected features
centroidMat <- do.call(cbind, lapply(unique(cellTypes_train), function(x)
Matrix::rowMeans(as.matrix(exprsMat_train[de, cellTypes_train == x]))))
colnames(centroidMat) <- unique(cellTypes_train)
# Constructing the tree using selected tree method
if (verbose) {
print("Constructing tree ...")
}
cutree_list <- constructTree(centroidMat,
tree = tree,
hopach_kmax = hopach_kmax,
plot = verbose)
} else {
cutree_list <- cellType_tree
}
if (verbose) {
print("Training....")
}
if (parallel) {
hierarchyKNNRes <- BiocParallel::bplapply(seq_len(length(selectFeatures)),
function(ft)
hierarchyKNNcor(exprsMat_train,
cellTypes_train,
cutree_list,
feature = selectFeatures[ft],
topN = topN,
pSig = pSig,
verbose = verbose),
BPPARAM = BPPARAM)
names(hierarchyKNNRes) <- selectFeatures
}else{
hierarchyKNNRes <- list()
for (ft in seq_len(length(selectFeatures))) {
if (verbose) {
print(paste("=== selecting features by:", selectFeatures[ft], "===="))
}
hierarchyKNNRes[[ft]] <- hierarchyKNNcor(exprsMat_train,
cellTypes_train,
cutree_list,
feature = selectFeatures[ft],
topN = topN,
pSig = pSig,
verbose = verbose)
}
names(hierarchyKNNRes) <- selectFeatures
}
trainRes <- list(hierarchyKNNRes = hierarchyKNNRes,
cutree_list = cutree_list,
cellTypes_train = cellTypes_train)
if (weightsCal) {
if (verbose) {
cat("========= SelfTraining to calculate weight ================= \n")
}
selfTrainRes <- predict_scClassify(exprsMat_test = exprsMat_train,
trainRes = trainRes,
cellTypes_test = cellTypes_train,
parallel = parallel,
BPPARAM = BPPARAM,
verbose = verbose,
features = selectFeatures,
...)
trainRes$selfTrainRes <- selfTrainRes
trainRes$modelweights <- getTrainWeights(selfTrainRes)
}
return(trainRes)
}
# Function to construct tree using HOPACH or HC
constructTree <- function(centroidMat,
tree = c("HOPACH", "HC"),
hopach_kmax = hopach_kmax,
plot= TRUE){
tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE)
if (tree == "HOPACH") {
res <- runHOPACH(data = t(centroidMat),
plot = plot,
kmax = hopach_kmax)
cutree_list <- res$cutree_list
}else{
distMat <- as.dist(1 - stats::cor(centroidMat))
hc <- stats::hclust(distMat, method = "complete")
# plot(hc)
cutree_list <- cutree_iterative(hc, depth = 1)
}
return(cutree_list)
}
# Cut the Hierarchical tree for each level
# depth here indicates the deep the tree is cut
cutree_iterative <- function(hc, depth = 1){
height_sort <- sort(hc$height, decreasing = TRUE)
cutree_list <- list()
for (i in seq_len(sum(height_sort >=
height_sort[round(length(height_sort)*depth)]))) {
cutree_list[[i]] <- cutree(hc, h = height_sort[i])
}
# Last level is distinct number
cutree_list[[length(cutree_list) + 1]] <- seq_len(length(hc$labels))
names(cutree_list[[length(cutree_list)]]) <- hc$labels
return(cutree_list)
}
currentClass <- function(cellTypes, cutree_res){
cellTypes <- as.character(cellTypes)
res <- cutree_res[cellTypes]
return(res)
}
# Function to perform hierarchical feature selection
hierarchyKNNcor <- function(exprsMat,
cellTypes,
cutree_list,
feature = c("limma", "DV", "DD", "chisq", "BI"),
topN = 50,
pSig = 0.001,
verbose= TRUE){
feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo"))
numHierchy <- length(cutree_list)
levelModel <- list()
levelHVG <- list()
for (i in seq_len(numHierchy)) {
#Make sure not all same cluster
if (length(unique(cutree_list[[i]])) != 1) {
model <- list()
hvg <- list()
class_tmp <- currentClass(cellTypes, cutree_list[[i]])
names(class_tmp) <- colnames(exprsMat)
for (j in unique(cutree_list[[i - 1]])) {
# print(paste("Group", j))
trainIdx <- which(cellTypes %in% names(cutree_list[[i - 1]])[cutree_list[[i - 1]] == j])
trainClass <- class_tmp[trainIdx]
if (length(unique(trainClass)) != 1) {
hvg[[j]] <- featureSelection(exprsMat[,trainIdx],
trainClass,
feature = feature,
topN = topN,
pSig = pSig
)
model[[j]] <- list(train = Matrix::t(exprsMat[na.omit(hvg[[j]]),
trainIdx,
drop = FALSE]),
y = as.factor(trainClass))
}else{
model[[j]] <- "noModel"
}
}
levelHVG[[i]] <- hvg
levelModel[[i]] <- model
}
}
res <- list(model = levelModel, hvg = levelHVG)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.