get_GT_genes <- function(GT_dir) {
gt_sets <- list()
if(!dir.exists(GT_dir)) {
stop("Ground truth directory not found! Make sure to supply the full path.")
} else {
fnames <- list.files(GT_dir, pattern="*.csv", full.names=TRUE)
}
# Append the GT sets to our GT list
lbls <- c()
for(file in fnames) {
tmp <- utils::read.table(file, sep = ",", header = FALSE, stringsAsFactors = FALSE)
lbls <- c(lbls, tmp[1,])
gt_sets <- append(gt_sets, as.data.frame(tmp[-1,], stringsAsFactors = FALSE))
}
names(gt_sets) <- lbls
gt_sets
}
#' Score the k-means clusters.
#'
#' This function takes the k-means clusters generated by generate_clusters and
#' computes the GECO scores for each gene within each cluster. This scores table
#' is meant to be used with the final function in the GECO process:
#' assess_quality.
#'
#' @param km_clusters The k-means clusters generated by generate_clusters
#' @param GT_dir The directory holding the ground truth .csv files
#' @return The scores table containing the GECO scored clusters
#' @examples
#' clusters <- list()
#' # This replicates the structure of the clusters returned by 'generate_clusters'
#' df <- data.frame(replicate(10,sample(-1:10,200,rep=TRUE)))
#' rownames(df) <- paste0(rep("Gene.", 200), seq(1:200))
#' clusters$`Iteration 1`$`10` <- kmeans(df, centers = 10)
#' # This replicates a ground truth set
#' gt_set <- c("Ground Truth Set X", "Gene.1", "Gene.10", "Gene.50")
#' # Store the Ground Truth set in its own directory
#' dir.create("./GECOdata")
#' write.table(gt_set, file = "./GECOData/gs_genes.csv", row.names = FALSE, col.names = FALSE)
#' # Score the clusters using the ground truth sets found in /GECOdata
#' scores <- score_clusters(clusters, "./GECOdata")
#' @export
score_clusters <- function(km_clusters, GT_dir) {
# Get the ground truth sets
gt_sets <- get_GT_genes(GT_dir)
# Ensure that the ground truth genes can be found within the dataset
all_genes <- names(km_clusters[[1]][[1]]$cluster)
for(i in seq_len(length(gt_sets))) {
if(sum(gt_sets[[i]] %in% all_genes) <= 0) {
stop(paste0("No genes from the ground truth set '", names(gt_sets[i]), "' can be found in the data!"))
}
}
# Create the scores list
scores <- list()
gt_cnt <- 0
for(gt_set in gt_sets){
gt_cnt <- gt_cnt + 1
# Store the scoring tables
gt_ROC_list <- vector(mode = "list", length(km_clusters))
# Find number of iterations
num_iter <- length(km_clusters)
# Work through the iterations: 1 - num_iter
for(itr in seq_len(num_iter)) {
clust_itr <- km_clusters[[itr]]
k_vec <- names(clust_itr)
gt_ROC <- vector(mode = "list", sum(clust_itr[[itr]]$size))
# Work through the k-values: kmin - kmax
for(k in k_vec) {
clust <- clust_itr[[k]]
clust_num <- vector(mode = "list", as.numeric(k))
gene_name <- vector(mode = "list", as.numeric(k))
pos_vec <- vector(mode = "list", as.numeric(k))
prob_vec <- vector(mode = "list", as.numeric(k))
# Check each cluster for GT genes
for(i in seq_len(k)) {
gene_name[[i]] <- names(which(clust$cluster == i))
# If cluster is not a singlet
if(clust$size[i]>1) {
clust_num[[i]] <- rep(i, clust$size[i])
pos_vec[[i]] <- gene_name[[i]] %in% gt_set
gt_bf <- ( (length(gt_set) - pos_vec[[i]]) / (sum(clust$size) - 1) )
prob_vec[[i]] <- ( (sum(pos_vec[[i]]) - pos_vec[[i]]) / (clust$size[i] - 1) ) / gt_bf
}
# If cluster is a singlet
else {
clust_num[[i]] <- i
pos_vec[[i]] <- gene_name[[i]] %in% gt_set
prob_vec[[i]] <- length(gt_set) / (sum(clust$size) - 1)
}
}
clust_num <- unlist(clust_num)
gene_name <- unlist(gene_name)
pos_vec <- unlist(pos_vec)
prob_vec <- unlist(prob_vec)
score_tbl <- data.frame(clust_num, gene_name, pos_vec, prob_vec)
gt_ROC[[k]] <- score_tbl
}
gt_ROC <- Filter(Negate(is.null), gt_ROC)
names(gt_ROC) <- k_vec
gt_ROC_list[[itr]] <- gt_ROC
}
names(gt_ROC_list) <- paste0("Iteration ", c(seq_len(num_iter)), sep = "")
scores[[gt_cnt]] <- gt_ROC_list
}
names(scores) <- names(gt_sets)
return(scores)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.