#' @title The CountsClust function- a function for clustering and visualization of counts!
#'
#' @param data counts data, with samples along the rows and features along the columns.
#' @param nclus_vec a vector of topics/clusters that user wants to fit.
#' @param samp_metadata the sample metadata, samples along the rows and each column representing some metadata information
#' @param tol the tolerance value for topic model fit (set to 0.001 as default)
#' @param batch_lab the batch labels, the output will have one Structure plot arranged by batch labels too.
#' @param use_parallel equal to TRUE if parallel implementation / default is FALSE
#' @param use_tsne equal to TRUE if the user wants to compute t-SNE / default is TRUE
#' @param use_pca equal to TRUE is the user wants to compute the PCA/ default is TRUE
#' @param thresh_prop The threshold proportion used to remove/substitute the columns of data containing NA. Deafult is 0 implying that all columns
#' with NA are removed.
#' @param filter_prop The threshold proportion that determines if the feature is too sparse or not. Default is 0.9.
#' @param expand_factor An expansion factor used to obtain batch corrected counts
#' @param partition A logical vector of same length as metadata. partition[i]=TRUE will imply that for the Structure
#' plot for i th metadata, no vertical line parititon between classes is used.
#' @param top_features The number of top features per cluster that drives away that cluster from others. Default value is 10
#' @param method The underlying model assumed for KL divergence measurement. Two choices considered- "bernoulli" and "poisson"
#' @param control() A list of control parameters for the Structure plot. The control list has the arguments
#' struct.width, struct.height, cex.axis, cex.main, lwd,las and color and margin parameters.
#'
#' @description This function takes the counts data (no. of samples x no. of features), the vector of topics/clusters the user wants to fit, along
#' with the sample metadata and batch label information and it produces the Structure plots with and without controlling fro batch effects for
#' different number of topics, with arrangement determined by the sample metadata. Additionally, it outputs the cluster data output and model Bayes
#' factor, and also plots t-SNE and the PCA for both with and without batch cluster outputs.
#'
#' @author Kushal K Dey, Matthew Stephens
#'
#' @references Matt Taddy.On Estimation and Selection for Topic Models. AISTATS 2012, JMLR W\&CP 22.
#'
#' Pritchard, Jonathan K., Matthew Stephens, and Peter Donnelly. Inference of population structure using multilocus genotype data.
#' Genetics 155.2 (2000): 945-959.
#'
#' Broman KW. R/qtlcharts: interactive graphics for quantitative trait locus mapping. 2015. Genetics 199:359-361
#'
#' L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms. Journal of Machine Learning Research, 2014:3221-3245
#'
#' @keywords counts data, clustering, topic model, Structure plot
#'
#' @export
#'
countsclust <- function(data,
nclus_vec,
samp_metadata,
samp_lab_tsne,
tol=0.001,
batch_lab,
use_parallel=FALSE,
use_tsne=TRUE,
use_pca=TRUE,
thresh_prop=0,
filter_prop=0.9,
expand_factor=100,
feature_extr_method=c("poisson","bernoulli"),
top_features=10,
partition=rep('TRUE',ncol(samp_metadata)),
control=list())
{
data_preprocessed <- handleNA(data,thresh_prop=thresh_prop)$data;
data_filtered <- RemoveSparseFeatures(data_preprocessed,filter_prop = filter_prop)$data;
counts <- data_filtered;
rm(data_filtered)
control.default <- list(struct.width=600, struct.height=400, cex.axis=0.5, cex.main=1.5, las=2, lwd=2,
mar.bottom =14, mar.left=2, mar.top=2, mar.right=2,color=2:(dim(omega)[2]+1));
namc=names(control)
if (!all(namc %in% names(control.default)))
stop("unknown names in control: ", namc[!(namc %in% names(control.default))])
control=modifyList(control.default, control)
if(!dir.exists("Structure")) dir.create("Structure")
if(!dir.exists("Structure/batch_uncorrected")) dir.create("Structure/batch_uncorrected")
bayesfac <- array(0,length(nclus_vec));
tsne_out <- vector("list", length(nclus_vec));
imp_features <- vector("list", length(nclus_vec));
if(use_tsne){
if(!dir.exists("tSNE")) dir.create("tSNE")
if(!dir.exists("tSNE/batch_uncorrected")) dir.create("tSNE/batch_uncorrected")
}
for(num in 1:length(nclus_vec))
{
if(!dir.exists(paste0("Structure/batch_uncorrected/clus_",nclus_vec[num]))) dir.create(paste0("Structure/batch_uncorrected/clus_",nclus_vec[num]))
obj <- StructureObj(counts,nclus_vec, samp_metadata = samp_metadata, tol=tol, batch_lab = batch_lab, path=paste0("Structure/batch_uncorrected/clus_",nclus_vec[num]), control=controlinput);
bayesfac[num] <- obj$bf;
if(use_tsne){
if(!dir.exists(paste0("tSNE/batch_uncorrected/clus_",nclus_vec[num]))) dir.create(paste0("tSNE/batch_uncorrected/clus_",nclus_vec[num]))
tsne_out[[num]] <- tsne_struct(obj$omega,samp_lab_tsne,paste0("tSNE/batch_uncorrected/clus_",nclus_vec[num]));
}
if(feature_extr_method=="poisson")
imp_features[[num]] <- unique(as.vector(ExtractTopFeatures(obj$theta,top_features=top_features,method="poisson")))
else
imp_features[[num]] <- unique(as.vector(ExtractTopFeatures(obj$theta,top_features=top_features,method="bernoulli")))
}
if(is.null(batch_lab))
{
ll <- list("tsne"=tsne_out,"tsne_batchcorrect"=as.numeric(),"imp_features"=imp_features,"imp_features_batchcorrect"=as.numeric(),
"bayes.fac"=bayesfac,"bayes.fac.batchcorrect"=as.numeric())
return(ll)
}
if(!is.null(batch_lab)){
batch_corrected_counts <- BatchCorrectedCounts(counts,expand_factor = expand_factor,batch_lab)
if(!dir.exists("Structure/batch_corrected")) dir.create("Structure/batch_corrected")
if(use_tsne){
if(!dir.exists("tSNE/batch_corrected")) dir.create("tSNE/batch_uncorrected")}
bayesfac_batchcorrect <- array(0,length(nclus_vec));
tsne_out_batchcorrect <- vector("list", length(nclus_vec));
imp_features_batchcorrect <- vector("list", length(nclus_vec));
for(num in 1:length(nclus_vec))
{
if(!dir.exists(paste0("Structure/batch_corrected/clus_",nclus_vec[num]))) dir.create(paste0("Structure/batch_corrected/clus_",nclus_vec[num]))
obj <- StructureObj(batch_corrected_counts,nclus_vec[num],samp_metadata = samp_metadata, tol=tol, batch_lab = batch_lab, path=paste0("Structure/batch_corrected/clus_",nclus_vec[num]),control=controlinput);
bayesfac_batchcorrect[num] <- obj$bf;
if(use_tsne){
if(!dir.exists(paste0("tSNE/batch_corrected/clus_",nclus_vec[num]))) dir.create(paste0("tSNE/batch_corrected/clus_",nclus_vec[num]))
tsne_out_batchcorrect[[num]] <- tsne_struct(obj$omega,samp_lab_tsne,paste0("tSNE/batch_corrected/clus_",nclus_vec[num]));
}
if(feature_extr_method=="poisson")
imp_features_batchcorrect[[num]] <- unique(as.vector(ExtractTopFeatures(obj$theta,top_features=top_features,method="poisson")))
else
imp_features_batchcorrect[[num]] <- unique(as.vector(ExtractTopFeatures(obj$theta,top_features=top_features,method="bernoulli")))
}
ll <- list("tsne"=tsne_out,"tsne_batchcorrect"=tsne_out_batchcorrect,"imp_features"=imp_features,"imp_features_batchcorrect"=imp_features_batchcorrect,
"bayes.fac"=bayesfac,"bayes.fac.batchcorrect"=bayesfac_batchcorrect)
return(ll)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.