# Author: Monica Golumbeanu <monica.golumbeanu@bsse.ethz.ch>
##############################
#' @title Extracts a time series data frame from a text file
#'
#' @description \code{get_time_series_df} creates a data frame containing time
#' series data from a file.
#' @param data_file path to a tab-delimited text file containing the time series
#' data formatted such that each row contains a time-series represented by its
#' name (e.g. gene name, protein name, etc.) and the values at each time point.
#'
#' @return A data frame containing the time series
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @examples
#' # Load a simulated toy time-series data provided with the package
#' toy_data_file = system.file("extdata", "toy_time_series.txt",
#' package = "TMixClust")
#' toy_data= get_time_series_df(toy_data_file)
#'
#' # Print the first lines of the resulting data frame
#' print(head(toy_data))
#'
#' @export
#'
get_time_series_df = function(data_file) {
if (!file.exists(data_file))
stop("Input file does not exist.")
if (missing(data_file))
stop("A file containing the data to be loaded needs to be specified.")
data_table = read.table(data_file, header=TRUE, na.strings =" ",
row.names=1, sep="\t")
return(data_table)
}
##############################
#' @title Extracts a time series data frame from a Bioconductor Biobase
#' ExpressionSet object.
#'
#' @description \code{get_time_series_df_bio} creates a data frame with time
#' series data from a Bioconductor Biobase ExpressionSet object.
#' @param bio_obj Bioconductor Biobase ExpressionSet object. The assayData has
#' to contain a matrix where each row is a gene time series and each column
#' contains the time series values at each time point. The number of columns is
#' equal to the number of time points, while the number of rows is equal to the
#' number of genes.
#'
#' @return A data frame containing the time series
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @import SPEM
#' @import Biobase
#'
#' @examples
#' # Load the SOS pathway data from Bioconductor package SPEM
#' library(SPEM)
#' data(sos)
#' sos_data = get_time_series_df_bio(sos)
#'
#' # Print the first lines of the retrieved time series data frame
#' print(head(sos_data))
#'
#' @export
#'
get_time_series_df_bio = function(bio_obj) {
if (missing(bio_obj))
stop("A Biobase ExpressionSet object needs to be specified.")
data_table = as.data.frame(assayDataElement(assayData(bio_obj), 'exprs'))
return(data_table)
}
##############################
#' @title Plots all the time series stored in a data frame object
#'
#' @description \code{plot_time_series_df} allows the user to visualise the time
#' series from a given data set.
#'
#' @param ts_df data frame containing on each row a time-series
#' @param time_points vector containing the values of the time points.
#' Default: \code{seq_len(ncol(time_series_df))}.
#' @param data_color color of the time series to be used for the plot.
#' Default is orange.
#' @param x_label label of the x axis of the plot. Default is "time"
#' @param y_label label of the y axis of the plot. Default is "value"
#' @param plot_title title of the plot. Default is "Time series plot".
#'
#' @return Plots a figure with all the the time series in the data set
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @examples
#' # Load the toy time series data provided with the TMixClust package
#' data(toy_data_df)
#'
#' # Plot the time series
#' plot_time_series_df(toy_data_df)
#' @export
#'
plot_time_series_df = function(ts_df, time_points = seq_len(ncol(ts_df)),
data_color="#fd8d3c", x_label="time",
y_label="value", plot_title="Time series plot") {
if (missing(ts_df))
stop("A data-frame containing the time series needs to be provided.")
# Plot the time series
par(mar=c(5,5,5,5), oma=c(0,0,0,0))
data_color = adjustcolor(data_color, alpha.f = 0.5)
yl = range(ts_df, na.rm = TRUE) + c(-1,1)
for (j in seq_len(nrow(ts_df))) {
plot(time_points, ts_df[j,], ylim=yl, axes = FALSE, xlab="", ylab="",
main="", col=data_color, type="l", pch=20,lwd=2)
par(new=TRUE)
}
par(new=FALSE)
# add axes to the plot
axis(side = 1, cex.axis=2.5,cex.lab=2.5)
axis(side = 2,cex.axis=2,cex.lab=2)
title(main=plot_title, xlab = x_label, ylab = y_label,
cex.lab=2, cex.main=2)
}
#############################
#' @title Clusters the time series data in a given number of groups
#'
#' @description \code{TMixClust} is the central function of the package.
#' It clusters the given time series data into a specified number of clusters.
#'
#' @param time_series_df data frame containing the time series.
#' Each row is a time series comprised of the time series name which is also the
#' row name, and the time series values at each time point.
#' @param time_points vector containing numeric values for the time points.
#' Default: \code{seq_len(ncol(time_series_df))}.
#' @param nb_clusters desired number of clusters
#' @param em_iter_max maximum number of iterations for the
#' expectation-maximization (EM) algorithm. Default: 1000.
#' @param mc_em_iter_max maximum number of iterations for Monte-Carlo
#' resampling. Default is 100.
#' @param em_ll_convergence convergence threshold for likelihood improvement.
#' Default is 0.001.
#'
#' @return list object with the following attributes:
#' \itemize{
#' \item \code{em_gss_obj_list} object of class \code{gss} containing
#' estimated parameters of the mixed-effects model
#' (see package vignette for more details).
#' \item \code{em_pi_k} vector containing the mixing coefficients
#' corresponding to each cluster
#' \item \code{em_mat_post} matrix containing the posterior values for each
#' time series and cluster
#' \item \code{em_cluster_assignment} vector with the clustering attribution
#' for each time series
#' \item \code{el_ll} vector containing the log likelihood values at each
#' iteration in the EM algorithm
#' \item \code{ts_data} the same as the input time series data-frame
#' \item \code{ts_time_points} the same as the input time-points vector
#' }
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @importFrom mvtnorm dmvnorm
#' @import gss
#' @import stats
#' @importFrom utils read.table head
#'
#' @examples
#' # Load the toy time series data provided with the TMixClust package
#' data(toy_data_df)
#'
#' # Cluster the toy data with default parameters
#' TMixClust_obj = TMixClust(toy_data_df)
#'
#' @export
#'
TMixClust = function(time_series_df,
time_points = seq_len(ncol(time_series_df)),
nb_clusters = 2, em_iter_max = 1000, mc_em_iter_max = 10,
em_ll_convergence = 0.001) {
# input checking conditions
if (missing(time_series_df))
stop("A data frame containing time series needs to be provided.")
if(!(length(time_points)==ncol(time_series_df)))
stop("The provided time series vector has different length than the
number of columns in the time series data frame.")
if(!(nb_clusters==round(nb_clusters) && nb_clusters > 1))
stop("The number of clusters has to be a positive integer
larger than 1.")
if(!(em_iter_max==round(em_iter_max) && em_iter_max > 1))
stop("The number of iterations for EM algorithm has to be a positive
integer larger than 1.")
if(!(mc_em_iter_max==round(mc_em_iter_max) && mc_em_iter_max > 1))
stop("The number of iterations for the MC EM has to be a positive
integer larger than 1.")
if(em_ll_convergence < 0)
stop("The convergence threshold for the likelihood convergence has to
be positive.")
# obtain starting configuration for EM with kmeans
message("Initializing ...")
k_init = init_kmeans(time_series_df, nb_clusters, time_points)
# perform EM to estimate the parameters of the model and cluster the time
# series based on their posterior
message("Performing EM, this operation might take a few minutes ...")
em_clust = do_EM(time_series_df, time_points, nb_clusters, k_init,
em_iter_max, mc_em_iter_max, em_ll_convergence)
message("Clustering done, results stored in a TMixClust object.")
return(em_clust)
}
#############################
#' @title Generates a series of files containing a summary of the TMixClust
#' analysis results
#'
#' @description \code{generate_TMixClust_report}
#'
#' @param TMixClust_object list object created by the \code{TMixClust}
#' function (see function \code{TMixClust})
#' @param report_folder full path of the folder where the report files will
#' be saved. Default is TMixClust_report/ folder in current working directory.
#' @param data_color color of the time series to be used when generating the
#' cluster plots. Default is orange.
#' @param x_label label of the x axis for the cluster plots. Default is "time"
#' @param y_label label of the y axis for the cluster plots. Default is "value"
#'
#' @return Produces a series of files containing information about the
#' clustering results and saves them in the provided folder location.
#' The folder contains the following:
#' \itemize{
#' \item \code{log-lihelihood.txt} - file with the log likelihood values at
#' each iteration on separate lines
#' \item \code{log-likelihood.pdf} - plot of log-likelihood at each iteration
#' \item \code{posterior.txt} - file with the posterior probabilities of all
#' the time-series for each cluster
#' \item \code{estimated_curves/} - folder containing a number of files equal to
#' the number of clusters; each file has 4 lines consisting of curve values and
#' their confidence intervals (first 3 lines) for a discrete time grid
#' (last line).
#' \item \code{clusters/} - folder containing a plot with the time series in
#' each cluster, a silhouette plot of the clustering configuration, as well as,
#' for each cluster, a file containing the names of the time series in the
#' respective cluster and a file containing the names and time series values
#' for the time series in each cluster.
#' }
#'
#' @examples
#' \dontrun{
#' # Load the toy time series data provided with the TMixClust package
#' data(toy_data_df)
#'
#' # Cluster the toy data with default parameters
#' TMixClust_obj = TMixClust(toy_data_df)
#'
#' # Generate a TMixClust report in the current working directory
#' generate_TMixClust_report(TMixClust_obj)
#' }
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @import gss
#' @import stats
#' @import cluster
#' @importFrom utils read.table write.table
#' @importFrom grDevices adjustcolor col2rgb dev.off pdf
#' @importFrom graphics abline axis par plot title
#'
#' @export
#'
generate_TMixClust_report = function(TMixClust_object, report_folder=paste(
getwd(),"/TMixClust_report/",sep=""), data_color="#fd8d3c", x_label="time",
y_label="value") {
if (missing(TMixClust_object))
stop("A TMixClust object needs to be provided.")
if (is.null(TMixClust_object))
stop("The TMixClust object is empty.")
tryCatch( {
col2rgb(data_color)
} , error = function(e) {
message(e)
stop("Provided data color is not a valid color character.")
})
# extract number of clusters
nb_clusters = length(TMixClust_object$em_pi_k)
# check folder name
if (!substring(report_folder, nchar(report_folder)) == "/") {
report_folder = paste(report_folder,"/", sep="")
}
# create the necessary folders
if (!dir.exists(report_folder)){
tryCatch({
message("creating report folder")
dir.create(report_folder)
}, warning = function(w) {
message(w)
}, error = function(e) {
message(e)
stop("There was a problem while creating the report folder.
Check if you have writing permission or the specified path
is correct.")
})
}
if (!dir.exists(paste(report_folder,"clusters/", sep=""))){
dir.create(paste(report_folder,"clusters/", sep=""))
}
if (!dir.exists(paste(report_folder,"estimated_curves/", sep=""))){
dir.create(paste(report_folder,"estimated_curves/", sep=""))
}
# retrieve the curves with Bayesian confidence intervals
for (k in seq_len(nb_clusters)) {
genes_k = which(TMixClust_object$em_cluster_assignment==k)
if (length(genes_k)>0){
if (!is.null(TMixClust_object$em_gss_obj_list[[k]]$fit_model)) {
est_curve = matrix(0, nrow = 4, ncol = 100)
time_grid = seq(min(TMixClust_object$ts_time_points),
max(TMixClust_object$ts_time_points),
length = 100)
grid_predict = predict(
TMixClust_object$em_gss_obj_list[[k]]$fit_model,
data.frame(tm = time_grid), se.fit=TRUE)
est_curve[1,] = grid_predict$fit + 1.96*grid_predict$se.fit
est_curve[2,] = grid_predict$fit - 1.96*grid_predict$se.fit
est_curve[3,] = grid_predict$fit
est_curve[4,] = time_grid
# write the curve and its confidence intervals to file
write.table(est_curve,paste(
report_folder,"estimated_curves/estimated_curve_cluster_",k
,".txt",sep=""), quote = FALSE, row.names = FALSE,
col.names = FALSE)
}
# write the cluster assignments to file
write.table(TMixClust_object$ts_data[genes_k,],
paste(report_folder,"clusters/cluster",k,".txt",sep=""),
quote = FALSE)
write.table(row.names(TMixClust_object$ts_data[genes_k,]),
paste(report_folder,"clusters/cluster",k
,"_names.txt",sep=""), quote = FALSE,
row.names = FALSE, col.names = FALSE)
}
}
# plot clusters time series
plot_clusters(TMixClust_object$ts_data, TMixClust_object$ts_time_points,
nb_clusters, TMixClust_object$em_cluster_assignment,
report_folder, data_color, x_label, y_label)
# calculate silhouette coefficients for each cluster and
# plot their distributions
complete_ts = na.omit(TMixClust_object$ts_data)
not_omitted = (rownames(TMixClust_object$ts_data)%in%rownames(complete_ts))
m = data.matrix(complete_ts, rownames.force = FALSE)
similarity_matrix = daisy(m)
sil <- silhouette(TMixClust_object$em_cluster_assignment[not_omitted],
similarity_matrix)
pdf(paste(report_folder,"clusters/silhouette.pdf",sep=""),
useDingbats = FALSE)
plot(sil, col = "#bdbdbd", border = "#bdbdbd",
main = paste("Silhouette plot for K=",
nb_clusters, " clusters", sep=""))
abline(v = mean(sil[,"sil_width"]), col="black", lwd=3, lty=2)
dev.off()
# write the likelihood values at each EM iteration to a file and plot it
write.table(TMixClust_object$em_ll,
paste(report_folder,"log-likelihood.txt",sep=""),
quote = FALSE, row.names = FALSE, col.names = FALSE)
plot_log_like(TMixClust_object$em_ll,
paste(report_folder,"log-likelihood.pdf",sep=""))
# write the posterior probabilities to a file
final_posterior = TMixClust_object$em_mat_post
rownames(final_posterior) = rownames(TMixClust_object$ts_data)
write.table(final_posterior, paste(report_folder,"posterior.txt",sep=""),
sep= "\t", quote = FALSE, col.names = FALSE)
}
#############################
#' @title Generates a silhouette plot for a given clustering configuration.
#'
#' @description \code{plot_silhouette}
#'
#' @param TMixClust_object list object created by the \code{TMixClust}
#' function (see function \code{TMixClust})
#' @param sim_metric character string taking one of the possible values:
#' "euclidean", "gower" or "manhattan". Default is "euclidean".
#' @param sil_color color of the bars representing the silhouette widths
#' on the plot
#'
#' @return List object with the following components:
#' \itemize{
#' \item \code{similarity_m} similarity matrix
#' \item \code{silh} silhouette object
#' }
#'
#' Renders a plot comprised of a set of barplots with the distributions of
#' silhouette
#' coefficients for the data points in each cluster. Each barplot has indicated
#' on its right hand side the total number of points in the corresponding
#' cluster.
#' The plot also indicates with a dotted line,
#' the overall average silhouette width,
#' whose value is specified at the bottom of the plot.
#'
#' @examples
#' # Load the TMixClust object associated to the toy time series data
#' # provided with the TMixClust package
#' data(best_clust_toy_obj)
#'
#' # Plot the silhouette for the clustering stored in the toy TMixClust object
#' plot_silhouette(best_clust_toy_obj)
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @import gss
#' @import stats
#' @import cluster
#' @importFrom grDevices adjustcolor col2rgb dev.off pdf
#' @importFrom graphics abline axis par plot title
#'
#' @export
#'
plot_silhouette = function(TMixClust_object, sim_metric="euclidean",
sil_color="#bdbdbd") {
# extract the number of clusters
nb_clusters = length(TMixClust_object$em_pi_k)
# calculate silhouette coefficients for each cluster and
# plot the distributions
# this works only for the time-series without missing values,
# therefore we have to first remove the data points with missing values:
complete_ts = na.omit(TMixClust_object$ts_data)
not_omitted = (rownames(TMixClust_object$ts_data)%in%rownames(complete_ts))
m = data.matrix(complete_ts, rownames.force = FALSE)
# compute similarity matrix and silhouette coefficients
similarity_matrix = daisy(m, metric = sim_metric)
sil <- silhouette(TMixClust_object$em_cluster_assignment[not_omitted],
similarity_matrix)
# create silhouette plot
plot(sil, col = sil_color, border = sil_color,
main = paste("Silhouette plot for K=", nb_clusters,
" clusters", sep=""))
abline(v = mean(sil[,"sil_width"]), col="black", lwd=3, lty=2)
return(list(similarity_m = m, silh = sil))
}
#############################
#' @title Stability analysis, clustering evaluation and optimal solution
#' selection
#'
#' @description \code{analyse_stability} Performs multiple clustering runs
#' with TMixClust, analyses the agreement between runs
#' with the Rand index and returns the clustering solution with the largest
#' likelihood.
#' A plot of agreement probability between all the runs and the run with the
#' maximum likelihood is produced.
#'
#' @param time_series_df data frame containing the time series.
#' Each row is a time series comprised of the time series name which is also
#' the row name, and the time series values at each time point.
#' @param time_points vector containing numeric values for the time points.
#' Default: \code{seq_len(ncol(time_series_df))}.
#' @param nb_clusters desired number of clusters
#' @param em_iter_max maximum number of iterations for the
#' expectation-maximization (EM) algorithm. Default: 1000.
#' @param mc_em_iter_max maximum number of iterations for Monte-Carlo
#' resampling. Default is 10.
#' @param em_ll_convergence convergence threshold for likelihood improvement.
#' Default is 0.001.
#' @param nb_clustering_runs number of times the clustering procedure is
#' repeated on the input data. Default is 3.
#' @param nb_cores number of cores to be used to run the separate clustering
#' operations in parallel. Default is 1.
#'
#' @return TMixClust object with the highest likelihood.
#' Renders a plot showing the overall distribution of the Rand index, which
#' allows the user to assess clustering stability.
#'
#' @examples
#' # Load the toy time series data provided with the TMixClust package
#' data(toy_data_df)
#'
#' # Identify the most optimal clustering solution with 3 clusters
#' best_clust_obj = analyse_stability(toy_data_df, nb_clusters = 3,
#' nb_clustering_runs = 4, nb_cores = 1)
#'
#' # Plot the time series from each cluster
#' for (i in seq_len(3)) {
#' # Extract the time series in the current cluster and plot them
#' c_df=toy_data_df[which(best_clust_obj$em_cluster_assignment==i),]
#' plot_time_series_df(c_df, plot_title = paste("cluster",i))
#' }
#'
#' @author Monica Golumbeanu, \email{monica.golumbeanu@bsse.ethz.ch}
#' @references Golumbeanu M, Desfarges S, Hernandez C, Quadroni M, Rato S,
#' Mohammadi P, Telenti A, Beerenwinkel N, Ciuffi A. (2017) Dynamics of
#' Proteo-Transcriptomic Response to HIV-1 Infection.
#'
#' @import BiocParallel
#' @importFrom flexclust randIndex
#' @importFrom grDevices adjustcolor col2rgb dev.off pdf
#' @importFrom graphics abline axis par plot title hist
#'
#' @export
#'
analyse_stability = function(time_series_df,
time_points = seq_len(ncol(time_series_df)),
nb_clusters = 2, em_iter_max = 1000,
mc_em_iter_max = 10, em_ll_convergence = 0.001,
nb_clustering_runs=3, nb_cores=1){
# input checking conditions
if (missing(time_series_df))
stop("A data frame containing time series needs to be provided.")
if(!length(time_points)==ncol(time_series_df))
stop("The provided time series vector has different length than
the number of columns in the time series data frame.")
if(!(nb_clusters==round(nb_clusters) && nb_clusters > 1))
stop("The number of clusters has to be a positive integer larger
than 1.")
if(!(em_iter_max==round(em_iter_max) && em_iter_max > 1))
stop("The number of iterations for EM algorithm has to be a
positive integer larger than 1.")
if(!(mc_em_iter_max==round(mc_em_iter_max) && mc_em_iter_max > 1))
stop("The number of iterations for the MC EM has to be a positive
integer larger than 1.")
if(em_ll_convergence < 0)
stop("The convergence threshold for the likelihood convergence has to
be positive.")
if(!(nb_clustering_runs==round(nb_clustering_runs) &&
nb_clustering_runs >= 1))
stop("The number of clustering runs has to be a positive integer
strictly larger than 0.")
if(!(nb_cores==round(nb_cores) && nb_cores >= 1))
stop("The number of computing cores has to be a positive integer
strictly larger than 0.")
TMixClust_wrapper = function(n){
TMixClust(time_series_df, time_points, nb_clusters,em_iter_max,
mc_em_iter_max, em_ll_convergence)
}
cl_obj = bplapply(seq_len(nb_clustering_runs), TMixClust_wrapper,
BPPARAM = MulticoreParam(workers = nb_cores))
# find the solution with the highest likelihood
max_pos = which.max(lapply(cl_obj, function(x) x$em_ll[length(x$em_ll)] ))
best_clust_obj = cl_obj[[max_pos]]
# calculate the rand index between each TMixClust object and the object with
# the highest likelihood
rand_ind = NULL
for (i in seq_len(nb_clustering_runs)) {
g1 = cl_obj[[i]]$em_cluster_assignment
g2 = cl_obj[[max_pos]]$em_cluster_assignment
tab <- table(g1, g2)
rand_ind[i] = randIndex(tab)
}
# plot the distribution of the obtained Rand indexes
hist(rand_ind, xlab = "rand index",
main = "Distribution of agreement probability",
col = "#2c7fb8", breaks = 10)
return(best_clust_obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.