#' Ranks NanoString normalizations using cluster validity indices
#'
#' @description ranks NanoString normalisation by 1) clustering of
#' groups and; 2) variation in relative log expression.
#' Clustering is assessed by a Generalised Dunn Index using the
#' average linkage and average diamter for inter and intracluster
#' distances, respectively. Overall variation is assessed by Relative
#' Log Expression.
#'
#' @param count_set a normalised, count_set summarized experiment
#' generated by count_set.Default = NULL
#'
#' @examples
#'# biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#' "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#' "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#' group = rnf5_group,
#' samp_id = rnf5_sampleid)
#' # normalize
#' rnf5_count_set_norm <- multi_norm(count_set = rnf5_count_set,
#' positive_control_scaling = TRUE,
#' background_correct = "mean2sd")
#' #plot_dir = "~/Dropbox/git/NanoStringClustR/plot_test/")
#' # rank normalizations
#' rnf5_eval <- norm_rank(rnf5_count_set_norm)
#'
#' @return ranked normalizations
#'
#' @export norm_rank
#'
#' @importFrom clv cls.scatt.data clv.Dunn
#'
#'
#' @references
#'
#' Lukasz Nieweglowski (2013). clv: Cluster Validation Techniques.
#' R package version 0.3-2.1. https://CRAN.R-project.org/package=clv
#'
norm_rank <- function(count_set = count_set){
####### Input checks #######------------------------------------------------
# check count_set is provided
if(is.null(count_set)) stop("A count_set list generated by count_set must
be provided.")
if(!is.null(count_set)) {
# check the file format
if(count_set@class != "SummarizedExperiment"){
stop(paste("summarized file provided is not a SummarizedExperiment,
Please produce a SummarizedExperiment using count_set.", "\n",
sep=""))
}
}
# check that the count_set has been normalised
if(length(names(assays(count_set))) == 1)
stop("The count_set input must be normalised with multi_norm")
####### Input checks finished #######-----------------------------------------
### refactorise count_set groups
count_set$group <- factor(count_set$group)
### dunn indicies for groups ### --------------------------------------------
assays_all <- names(assays(count_set))
all_group_dunns <- lapply(seq_along(assays_all), function (i){
dunn_wrap(count_set = count_set,
norm_method = assays_all[i],
clusters = as.vector(as.integer((count_set$group))))
})
names(all_group_dunns) <- assays_all
combined_group_dunns <- lapply(seq_along(names(all_group_dunns)), function(i){
method_i <- data.frame("norm_method" = names(all_group_dunns)[i],
"GDI" = all_group_dunns[[i]][1])
})
group_dunns <- do.call("rbind", combined_group_dunns)
### rank group dunns ### -------------------
### lower rank = higher GDI = better clustering
group_dunns_ordered <- group_dunns[order(group_dunns$GDI), ]
group_dunns_ordered$GDI_rank <- c(nrow(group_dunns_ordered):1)
### variation around median RLE ### -------------------------------------------
all_vars <- lapply(seq_along(assays_all), function(i){
variation_around_med(count_set = count_set,
norm_method = assays_all[i])
})
names(all_vars) <- assays_all
vars <- as.data.frame(do.call("rbind", all_vars))
vars_df <- data.frame("norm_method" = rownames(vars),
"MRLE" = vars$V1)
### rank RLE ### -------------------
### lower rank = lower sum variation = better RLE
vars_ordered <- vars_df[order(vars_df$MRLE),]
vars_ordered$MRLE_rank <- c(1:nrow(vars_ordered))
### overall ranks ### -------------------------------------------
merged <- merge(vars_ordered, group_dunns_ordered, by = "norm_method")
merged$NanoNormRank <- merged$MRLE_rank + merged$GDI_rank
merged_ordered <- merged[order(merged$NanoNormRank),]
return(merged_ordered)
}
######## function for dunn index ranking ####-----------------------------------
dunn_wrap <- function(count_set = count_set,
norm_method = "housekeeping_scaled",
clusters = as.vector(as.integer((count_set$group)))){
get_counts <- paste0("assays(count_set)$", norm_method)
data_in <- na.omit(eval(parse(text=get_counts)))
cluster_data_euk <- clv::cls.scatt.data(t(data_in),
clust = clusters,
dist = "euclidean")
#calculate GDI with average intra and intercluster distances
intraclust = c("average")
interclust = c("average")
dunn_data_euk <- clv::clv.Dunn(cluster_data_euk, intraclust, interclust)
return(dunn_data_euk)
}
####### function for calculating sum variation around median RLE ##### -------
variation_around_med <- function(count_set = count_set,
norm_method = "housekeeping_scaled"){
get_counts <- paste0("assays(count_set)$", norm_method)
data_in <- na.omit(eval(parse(text=get_counts)))
#calculate median expression of each gene
med_gene <- apply(data_in, 1, median)
#calculate deviations from this median genecount-med_gene
dev_from_med <- apply(data_in, 2, function(i) (i - med_gene))
#calculate median deviations per sample - median of RLE plpot
med <- apply(dev_from_med, 2, function(i) median(i))
#sum absolute differences of the medians from 0
sum_delta <- sum(abs(med))
return(sum_delta)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.