#' Random Forest classifier for supervised demarcation of groups using flow cytometry data.
#'
#' @param x flowSet object where the necessary metadata for classification is
#' included in the phenoData.
#' @param sample_info Sample information necessary for the classification, has to
#' contain a column named "name"
#' which matches the samplenames of the FCS files stored in the flowSet.
#' @param sample_col Column name of the sample names in sample_info. Defaults to "name".
#' @param target_label column name of the sample_info dataframe that should be
#' predicted based on the flow cytometry data.
#' @param downsample Indicate to which sample size should be downsampled.
#' By default samples are downsampled to the sample size of the sample with the
#' lowest number of cells.
#' Defaults to sample level.
#' @param param Parameters to base classification on.
#' @param p_train Percentage of the data set that should be used for training the model.
#' @param seed Set random seed to be used during the analysis. Put at 777 by default.
#' @param cleanFCS Indicate whether outlier removal should be conducted prior to model estimation.
#' Defaults to FALSE. I would recommend to make sure samples have > 500 cells. Will denoise based on the parameters specified in `param`.
#' @param timesplit Fraction of timestep used in flowAI for denoising. Please consult the `flowAI::flow_auto_qc` function for more information.
#' @param TimeChannel Name of time channel in the FCS files. This can differ between flow cytometers. Defaults to "Time". You can check this by: colnames(flowSet).
#' @param plot_fig Should the confusion matrix and the overall performance statistics on the test data partition be visualized?
#' Defaults to FALSE.
#' @param method method used by caret::train for learning (defaults to Random forests)
#' @param classification_type whether to perform sample-level or single-cell level classification (defaults to sample-level)
#' @importFrom BiocGenerics unique colnames
#' @importFrom flowAI flow_auto_qc
#' @importFrom caret trainControl createDataPartition train confusionMatrix
#' @keywords random forest, fcm
#' @examples
#'
#' # 1. Example with environmental data:
#'
#' # Load raw data (imported using flowCore)
#' data(flowData)
#'
#' # Format necessary metadata
#' metadata <- data.frame(names = flowCore::sampleNames(flowData),
#' do.call(rbind, lapply(strsplit(flowCore::sampleNames(flowData),"_"), rbind)))
#' colnames(metadata) <- c("Sample_names", "Cycle_nr", "Location", "day",
#' "timepoint", "Staining", "Reactor_phase", "replicate")
#'
#' # Run Random Forest classifier to predict the Reactor phase based on the
#' # single-cell FCM data
#' model_rf <- RandomF_FCS(flowData, sample_info = metadata[1:10, ], sample_col = "Sample_names",
#' target_label = "Reactor_phase",
#' downsample = 10)
#'
#' # Make a model prediction on new data and report contigency table of predictions
#' model_pred <- RandomF_predict(x = model_rf[[1]], new_data = flowData[1], cleanFCS = FALSE)
#' print(model_pred)
#'
#' # 2. Example with synthetic community data
#' # Load flow cytometry data of two strains with each 5,000 cells measured
#' data(flowData_ax)
#'
#' # Quickly generate the necesary metadata
#' metadata_syn <- data.frame(name = flowCore::sampleNames(flowData_ax),
#' labels = flowCore::sampleNames(flowData_ax))
#'
#' # Run Random forest model on 100 cells of each strain
#' model_rf_syn <-
#' RandomF_FCS(
#' flowData_ax,
#' sample_info = metadata_syn,
#' sample_col = "name",
#' target_label = "labels",
#' downsample = 100,
#' plot_fig = TRUE
#' )
#'
#' # Make predictions on each of the samples or on new data of the mixed communities
#' model_pred_syn <- RandomF_predict(x = model_rf_syn[[1]], new_data = flowData_ax, cleanFCS = FALSE)
#' print(model_pred_syn)
#' @export
RandomF_FCS <- function(x,
sample_info,
sample_col = "name",
target_label,
downsample = 0,
classification_type = "sample",
param = c("FL1-H", "FL3-H", "FSC-H", "SSC-H"),
p_train = 0.75,
seed = 777,
cleanFCS = FALSE,
timesplit = 0.1,
TimeChannel = "Time",
plot_fig = FALSE,
method = "rf") {
# Set seed
set.seed(seed)
if(cleanFCS == TRUE){
cat(paste0("-------------------------------------------------------------------------------------------------", "\n"))
cat(date(), paste0("--- Using the following parameters for removing errant collection events\n in samples:\n \n"))
cat(paste0(param),"\n \n")
cat(date(), paste0("--- Scatter parameters will be automatically excluded", "\n"))
cat(paste0("-------------------------------------------------------------------------------------------------"))
cat("\n", paste0("Please cite:", "\n"))
cat("\n", paste0("Monaco et al., flowAI: automatic and interactive anomaly discerning tools for flow cytometry data,\n Bioinformatics, Volume 32, Issue 16, 15 August 2016, Pages 2473-2480, \n https://doi.org/10.1093/bioinformatics/btw191", "\n"))
cat(paste0("-------------------------------------------------------------------------------------------------", "\n \n"))
# Subset flowset to only samples in sample data
x <- x[sample_info[, sample_col]]
# Extract sample names
sam_names <- flowCore::sampleNames(x)
# Extract parameters not to base denoising on
param_f <- BiocGenerics::unique(gsub(param, pattern = "-H|-A|-W",
replacement = ""))
filter_param <- BiocGenerics::colnames(x)
filter_param <- BiocGenerics::unique(gsub(filter_param,
pattern = "-H|-A|-W",
replacement = ""))
filter_param <- filter_param[!filter_param %in% param_f &
filter_param!= TimeChannel]
filter_param <- c(filter_param, "FSC", "SSC")
# Exclude all scatter information from denoising
add_measuredparam <- base::unique(gsub(".*-([A-Z])$","\\1",param))[1]
# Denoise with flowAI
x <- flowAI::flow_auto_qc(x, alphaFR = 0.01,
folder_results = "QC_flowAI",
fcs_highQ = "HighQ",
output = 1,
timeCh = TimeChannel,
ChExcludeFM = paste0(param_f[param_f %in% c("FSC",
"SSC")],
"-", add_measuredparam),
ChExcludeFS = filter_param,
second_fractionFR = timesplit
)
# Subset data to relevant data
x <- x[, param]
# Add sample names again since the QC function removes these
x@phenoData@data$name <- sam_names
flowCore::sampleNames(x) <- sam_names
# Change characters in parameter description from character back to numeric
# Otherwise nothing that follows will work
for(i in 1:length(x)){
x[[i]]@parameters@data[,3] <- as.numeric(x[[i]]@parameters@data[,3])
x[[i]]@parameters@data[,4] <- as.numeric(x[[i]]@parameters@data[,4])
x[[i]]@parameters@data[,5] <- as.numeric(x[[i]]@parameters@data[,5])
}
}
# Subset flowset to only samples in sample data
x <- x[as.character(sample_info[, sample_col])]
# Step 0: Format metadata
Biobase::pData(x) <- base::cbind(Biobase::pData(x),
sample_info[base::order(base::match(as.character(sample_info[, sample_col]),
as.character(Biobase::pData(x)[, "name"]))),
target_label])
base::colnames(Biobase::pData(x))[2] <- target_label
# Step 1: Subset FCS files on parameters of interest
x <- x[, param]
# Step 1b: downsample
cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
x <- Phenoflow::FCS_resample(x, sample = downsample)
cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
# Step 2: add group labels present in pData to each cell and
# merge all cells in a single data.table (faster than dataframe)
for(i in 1:length(x)){
tmp <- data.table::data.table(label = Biobase::pData(x[i])[, target_label],
flowCore::exprs(x[[i]]))
if(i == 1){
full_data <- tmp
} else full_data <- base::rbind(full_data, tmp)
}
full_data <- base::droplevels(full_data)
# if(classification_type == "single-cell"){
# # Step 2: add sample labels to each cell
# flowCore::fsApply(x, use.exprs = TRUE)
# Biobase::pData(x)
# }
# Step 3: Set model parameters
fitControl <- caret::trainControl( ## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 3)
# Step 4: Create data partitions
trainIndex <- caret::createDataPartition(full_data$label, p = p_train)
train_data <- full_data[trainIndex$Resample1, ]
test_data <- full_data[-trainIndex$Resample1, ]
# Step 5: Train Random Forest classifier on training set
metric <- "Accuracy"
mtry <- round(base::sqrt(ncol(train_data)), 0)
tunegrid <- base::expand.grid(.mtry = mtry)
cat(date(), paste0("--- Training Random Forest classifier on ",
100 * p_train,
"% training set with options: \n",
"\t-Performance metric: ", metric, "\n",
"\t-Number of trees: ", 500, "\n",
"\t-mtry: ", round(mtry, 2), "\n",
"\t-method: ", fitControl$number, "x ", fitControl$method, "\n",
"\t-repeats: ", fitControl$repeats, "\n"))
cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
RF_train <- caret::train(label~., data = train_data, method = method,
metric = metric, tuneGrid = tunegrid,
trControl = fitControl, ntree = 500)
print(RF_train)
# cat(date(), paste0("--- Training Xgboost classifier on ",
# 100 * p_train,
# "% training set with options: \n",
# "\t-Performance metric: ", metric, "\n",
# "\t-Number of trees: ", 500, "\n",
# "\t-mtry: ", round(mtry, 2), "\n",
# "\t-method: ", fitControl$number, "x ", fitControl$method, "\n",
# "\t-repeats: ", fitControl$repeats, "\n"))
# cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
# bstSparse <- xgboost(data = train_data$data,
# label = train_data$label,
# max.depth = 2, eta = 1,
# nthread = 2, nround = 2,
# objective = "binary:logistic")
# print(bstSparse)
# Step 6: Accuracy on test set
cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
performance_metrics <- data.frame(metric = 1,
n_cells = c(table(test_data$label)),
label = levels(as.factor(test_data$label)))
for(n_label in 1:length(unique(test_data$label))){
tmp <- test_data[test_data$label == unique(test_data$label)[n_label], ]
tmp_pred <- stats::predict(RF_train, newdata = tmp)
index <- performance_metrics$label == unique(test_data$label)[n_label]
performance_metrics$metric[index] <-
round(100*base::sum(tmp_pred == tmp$label)/performance_metrics$n_cells[index],2)
}
colnames(performance_metrics)[1] <- "Accuracy"
cat(date(), paste0("--- Performance on ",
100 * (1 - p_train),
"% test set: \n"))
print(performance_metrics)
cat(paste0("-----------------------------------------------------------------------------------------------------\n"))
# For confusion matrix:
RF_pred <- stats::predict(RF_train, newdata = test_data)
# Create dataframe containing descision boundary of final model
## Extract minimum and maximum values of FCS files
# summary <- flowCore::fsApply(x = x, FUN = function(x) base::apply(x, 2, base::max), use.exprs = TRUE)
# maxval <- base::apply(summary, 2, max)
# minval <- base::apply(summary, 2, min)
# desc_coord <- list()
# for(i_param in 1:length(param[1:2])){
# desc_coord[[i_param]] <- seq(from = minval[i_param],
# by = ((maxval[i_param] - minval[i_param])/64),
# to = maxval[i_param])
# }
# ## Create vectors containing the parameter value bins for prediction
# desc_pred <- base::expand.grid(desc_coord)
# base::colnames(desc_pred) <- param
#
# ## Predict this dummy data
# RF_pred_desc <- stats::predict(RF_train, newdata = desc_pred)
# RF_pred_desc <- cbind(desc_pred, RF_pred_desc)
# colnames(RF_pred_desc) <- c(param, "label")
# Make list containing model, confusion matrix, summary statistics and descision boundary
results_list <- list()
results_list[[1]] <- RF_train
results_list[[2]] <- caret::confusionMatrix(data = RF_pred, as.factor(test_data$label))
# results_list[[3]] <- RF_pred_desc
# Return diagnostic plots (confusion matrix + descision boundary)
if(plot_fig == TRUE){
# Confusion matrix plot
mytable <- base::round(data.frame(performance = results_list[[2]]$overall), 2)
p_conf <- ggplot2::ggplot(data.frame(results_list[[2]]$table),
ggplot2::aes(x = Prediction, y = Reference, fill = 100*Freq/sum(Freq)))+
ggplot2::geom_raster()+
ggplot2::geom_text(ggplot2::aes(label = round(100*Freq/sum(Freq), 0)), size = 6)+
ggplot2::scale_fill_distiller( name = "% of total cells\n classified\n") +
ggplot2::theme_bw()+
ggplot2::scale_x_discrete(position = "top")+
ggplot2::theme(axis.title=ggplot2::element_text(size=16),
strip.text.x=ggplot2::element_text(size=14),
legend.title=ggplot2::element_text(size=14),
legend.text=ggplot2::element_text(size=14),
axis.text.y = ggplot2::element_text(size=13),
axis.text.x = ggplot2::element_text(size=13, angle = 55, hjust = 0),
title= ggplot2::element_text(size=20),
plot.margin = ggplot2::unit(c(1.1,1.1,1.1,1.1), "cm"),
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
panel.border = ggplot2::element_blank(),
panel.background = ggplot2::element_rect(fill = "transparent",colour = NA),
plot.background =ggplot2::element_rect(fill = "transparent",colour = NA)
)
p_conf_table <- ggplot2::ggplot(data.frame(results_list[[2]]$table),
ggplot2::aes(x = Prediction, y = Reference, fill = 100*Freq/sum(Freq)))+
ggplot2::theme(axis.title=ggplot2::element_text(size=16),
strip.text.x=ggplot2::element_text(size=14),
legend.title=ggplot2::element_text(size=14),
legend.text=ggplot2::element_text(size=14),
axis.text.y = ggplot2::element_text(size=13),
axis.text.x = ggplot2::element_text(size=13, angle = 55, hjust = 0),
title=ggplot2::element_text(size=20),
plot.margin = ggplot2::unit(c(1.1,1.1,1.1,1.1), "cm"),
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
panel.border = ggplot2::element_blank(),
panel.background = ggplot2::element_rect(fill = "transparent",colour = NA),
plot.background = ggplot2::element_rect(fill = "transparent",colour = NA)
)+
ggplot2::ylab("")+
ggplot2::xlab("")+
ggplot2::annotation_custom(gridExtra::tableGrob(mytable))
print(cowplot::plot_grid(p_conf, p_conf_table, align = "h", ncol = 2, rel_widths = c(1/2, 1/5)))
# Show positions of labelled cells for a stratified sample of the test data
# Descision boundary plot
# df_desc <- unique(results_list[[3]][, c(param[1:2], "label")]) # Retain only unique values on two primary parameters
# p_desc <- ggplot2::ggplot(df_desc, ggplot2::aes(x = `FL1-H`,
# y = `FL3-H`,
# fill = label))+
# geom_tile()+
# scale_fill_manual(values = c("red", "blue"))
# Use grid.arrange to put them in one figure
}
# Final step: Return list with final model, confusion matrix, summary statistics
# and descision boundary for further applications
return(results_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.