#General functionalities to cluster data and add cluster assignments to ts- and enrichment data
#' Add enrichment data and preprocess for analysis
#'
#' @param plist List storing patient time series data (also see function: \link{patient_list})
#' @param path Path where enrichment csv file is stored
#'
#' @return Processed data as object of type data frame; Enrichment data Patient_IDs are matched with Time Series Data List Patient IDs; In case it was indicated, NA values in the enrichment data are filled up by random sampling
#'
#' @import utils
#' @import dplyr
#'
#' @examples
#' list <- patient_list('.../ts_demofiles1') #Just folder; files can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/ts_demofiles1)
#' #Sampling frequency is supposed to be daily
#' enr <- add_enrich(list, '.../enrichment_dat.csv') #file can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/enrichment)
#'
#' @export
add_enrich <- function(plist, path) {
#Read csv-file on indicated path; empty fields in csv-file are filled up as NA
dat <- read.csv(path, na.strings=c("","NA"))
#Print first 5 lines of csv and present to user
cat("\n\n") #two empty lines as space before printing
print(head(dat[1:5,]))
cat("\n\n") #two empty lines after printing
#Create a data frame with the colnmaes and assigned ascending numbers to the colnames
feedback <- matrix(nrow = 2, ncol = length(colnames(dat)))
feedback[1,] <- colnames(dat)
feedback[2,] <- 1:length(dat)
feedback <- as.data.frame(feedback)
#Print the created feedback data frame
print(feedback, row.names=F, col.names=F, quote=F)
cat("\n")
#Ask the user which number (= which column) represents the Patient_ID
#This is of interest since later functionalities recognize the Patient_ID and
#do not consider them as a population parameter itself
question1 <- as.numeric(readline("Which column represents the Patient_ID (please indicate number)?: "))
#Rename column as "Patient_ID" so that later functionalities always recognize
#the patient column
names(dat)[names(dat) == feedback[1,question1]] <- 'Patient_ID'
#The enrichment data frame should be so prepared that Patient_ID's that appear in
#the indicated csv-file but not in the patient data list are thrown out and
#Patient_ID's that appear in the patient data list but not in the csv-file are
#added to the data frame and every parameter is filled with NA
#Get Patient_ID's from both previously loaded csv-file and patient time
#series data list
names_enr <- as.character(as.data.frame(table(dat[,'Patient_ID']))[,1])
names_ts <- names(plist)
#Check which patients are in enrich but not ts and throw them out
throw <- as.vector(dplyr::setdiff(names_enr,names_ts))
#Throw out Patient_ID's that do not appear in time series data list
dat_new1 <- filter(dat, !Patient_ID %in% throw)
#Check which patients are in ts but not enrich and add them with NAs
add <- dplyr::setdiff(names_ts,names_enr)
#In case there are Patient_ID's to add to the enrichment data frame
if (length(add > 0)) {
#Make a new matrix with same number of columns and colnames than csv-file
#Number of rows are number of Patient_ID's to be added
dat_new2 <- matrix(data = NA, nrow = length(add), ncol = length(colnames(dat)))
#Transform matrix to data frame
dat_new2 <- as.data.frame(dat_new2)
#Change the colnames to the colnames of the csv-file
colnames(dat_new2) <- colnames(dat)
#Add missing Patient_ID's to the Patient_ID column
dat_new2[,"Patient_ID"] <- add
#Rowbind the previously created data frame with the original enrichment data frame
dat_new <- rbind(dat_new1,dat_new2)
} else {
#In case there are no Patient_ID's to add, the previous data frame can be taken over
dat_new <- dat_new1
}
#In case there are NA entries in the data frame
if (any(is.na(dat_new))) {
#Ask user how NA values shoud be handled
question <- readline("Do you want to leave missing values as NA (1) or let them sample (2)?: ")
if (question == 2) {
#Go through each row and column
for (i in 1:ncol(dat_new)) {
for (j in 1:nrow(dat_new)) {
#Check in each cell if value is NA
if (is.na(dat_new[j,i])) {
#Take all non-NA values from column
samp <- na.omit(dat_new[,i])
#Randomly sample one value on NA position
dat_new[j,i] <- sample(samp,1)
}
}
}
}
}
dat_new
}
#' Add clustering assignments to enrichment data frame
#'
#' @param enrich Preprocessed enrichment data frame (also see function: \link{add_enrich})
#' @param clustdat Object of type list storing clustering data (also see function: \link{clust_matrix})
#'
#' @return Processed enrichment data frame with added column indicating cluster assignments
#'
#' @import tibble
#'
#' @examples
#' list <- patient_list('.../ts_demofiles1') #Just folder; files can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/ts_demofiles1)
#' #Sampling frequency is supposed to be daily
#' clustering <- clust_matrix(matrix, method = "kmeans", nclust = 3)
#' enr <- add_enrich(list, '.../enrichment_dat.csv') #file can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/enrichment)
#' enr <- add_clust2enrich(enr, clustering)
#'
#' @export
add_clust2enrich <- function(enrich, clustdat) {
#Get vector from clustdat list where cluster assignments to patients are stored
#(also see function clust_maxtrix())
new <- as.data.frame(clustdat$Cls)
new <- tibble::rownames_to_column(new)
#Ordering by Patient_ID later facilitates assignment since data frame "new"
#and enrichment data frame have the same order so only cbind needs to be done
new <- new[order(new$rowname),]
#Remove column with Patient_ID's afterwards, otherwise Patient_ID would be
#stored twice in final data frame
new[,"rowname"] <- NULL
#Rename cluster column
colnames(new) <- "Cluster"
#Do similar as above: Order enrichment data frame by Patient_ID
enrich <- enrich[order(enrich$Patient_ID),]
#Combine both data frame
enrich <- cbind(enrich,new)
}
#' Add clustering assignments to time series data
#'
#' @param plist List storing patient time series data (also see function: \link{patient_list})
#' @param clustdat Object of type list storing clustering data (also see function: \link{clust_matrix})
#'
#' @return Processed data frame storing time series data with added column indicating cluster assignments
#'
#' @import tibble
#' @import dplyr
#'
#' @examples
#' list <- patient_list('.../ts_demofiles1') #Just folder; files can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/ts_demofiles1)
#' #Sampling frequency is supposed to be daily
#' clustering <- clust_matrix(matrix, method = "kmeans", nclust = 3)
#' ts <- add_clust2ts(list, clustering)
#'
#' @export
add_clust2ts <- function(plist, clustdat) {
#Make one data frame out of all dataframe within time series data list
ts_df <- do.call(rbind,plist)
#New list to store the processed data
datalist <- list()
#Get vector from clustdat list where cluster assignments to patients are stored
#(also see function clust_maxtrix())
new <- as.data.frame(clustdat$Cls)
#Add Patient_ID's to extra column (helps for later processes)
new <- tibble::rownames_to_column(new)
#Rename the columns
colnames(new) <- c("Patient_ID","Cluster")
#For-loop for every cluster
for (i in 1:max(new[,"Cluster"])) {
#Filter for current cluster number
compare <- filter(new, Cluster %in% i)
#Remove column "Cluster"
compare[,"Cluster"] <- NULL
#Make a vector, only containing the Patient_ID's for current cluster
compare <- as.character(compare[,1])
#Filter time series data frame for Patient_ID's from current cluster
new2 <- filter(ts_df, Patient_ID %in% compare)
#Add a new column "Cluster" to time series data frame storing cluster number
new2[,"Cluster"] <- i
#Add current cluster time series data to list
datalist[[i]] <- new2
}
#Make one time series data frame out of all list elements
assigned <- do.call(rbind,datalist)
}
#' Observe specific cluster for overview and p-values
#'
#' @param ts.dat Processed data frame storing time series data and cluster assignments (also see function: \link{add_clust2ts})
#' @param enrich Processed data frame storing enrichment data and cluster assignments (also see function: \link{add_clust2enrich})
#' @param clustno Cluster number of interest
#'
#' @return Terminal output presenting summary of time series and enrichment data with corresponding p-values
#'
#' @import arsenal
#' @import dplyr
#'
#' @examples
#' list <- patient_list('.../ts_demofiles1') #Just folder; files can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/ts_demofiles1)
#' #Sampling frequency is supposed to be daily
#' clustering <- clust_matrix(matrix, method = "kmeans", nclust = 3)
#' enr <- add_enrich(list, '.../enrichment_dat.csv') #file can be drawn from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/enrichment)
#' enr <- add_clust2enrich(enr, clustering)
#' ts <- add_clust2ts(list, clustering)
#' enr_obs_clust(ts, enr, 1)
#'
#' @export
enr_obs_clust <- function(ts.dat, enrich, clustno) {
#Methods only applicable on pre-processed enrichment and time series data
#from functions: add_clust2enrich & add_clust2ts
#Remove "Patient_ID" column from enrichment data, otherwise they would be
#recognized as categorial variable for summary
enrich[,"Patient_ID"] <- NULL
#Split the data so that p-values can be calculated
#(in cluster = 0 vs. not in cluster = 1)
enrich_in <- filter(enrich, Cluster == clustno)
enrich_in[,"Cluster"] <- 0
enrich_out <- filter(enrich, Cluster != clustno)
enrich_out[,"Cluster"] <- 1
#Now bind them both together again
enrich_new <- rbind(enrich_in,enrich_out)
#From package "Arsenal"; Creates and calculates a summary of the data by cluster
#By default: p-values for continuous variables are calculated by Mann-Whitney test
#and Hypergeometric test for categorial variable
#The tableby() function creates a list where all the data is stored
table_enr <- tableby(Cluster ~., data = enrich_new)
#Now this list is transformed to a data frame to print easily the intermediary results
table_enr <- as.data.frame(summary(table_enr))
#Remove some reoccuring characters to make it readable
table_enr[,1] <- gsub(" ","",as.character(table_enr[,1]))
#Take out first part of summary, showing the parameters
col1 <- as.data.frame(table_enr[,1])
#Take out second part, showing the data for parameters inside cluster
col2 <- as.data.frame(table_enr[,2])
#Take out third part, showing the corresponding p-values
col3 <- as.data.frame(table_enr[,5])
#Bind the three parts
table_enr <- cbind(col1,col2,col3)
#Assign new colnames
colnames(table_enr) <- c("Parameter"," ","p-value")
#Count how many patients are in currently observed cluster
#Filter for current cluster
n_pat <- filter(enrich, Cluster %in% clustno)
#Length = Number of patients
n_pat <- length(n_pat[,1])
#z-norm the ts-data
#find the Patient_IDs from the cluster of interest
patnames <- filter(ts.dat, Cluster == clustno)
patnames <- names(table(patnames[,"Patient_ID"]))
#find the parameters for cluster of interest
parameters <- colnames(ts.dat)
parameters <- gsub('Patient_ID', NA, parameters)
parameters <- gsub('Time', NA, parameters)
parameters <- gsub('Interpolated', NA, parameters)
parameters <- gsub('Cluster', NA, parameters)
parameters <- na.omit(parameters)
#Start an analogous procedure for time series data
#Remove both Patient_ID and Time so that only time series data, information
#on interpolation and cluster assignment remain
ts.dat[,"Patient_ID"] <- NULL
ts.dat[,"Time"] <- NULL
#Split the data so that p-values can be calculated
#(in cluster = 0 vs. not in cluster = 1)
ts.dat_in <- filter(ts.dat, Cluster == clustno)
ts.dat_in[,"Cluster"] <- 0
ts.dat_out <- filter(ts.dat, Cluster != clustno)
ts.dat_out[,"Cluster"] <- 1
#Now bind them both together again
ts.dat_new <- rbind(ts.dat_in,ts.dat_out)
table_ts <- tableby(Cluster ~., data = ts.dat_new)
#Similar procedure as above (line 978)
table_ts <- as.data.frame(summary(table_ts))
table_ts[,1] <- gsub(" ","",as.character(table_ts[,1]))
cola <- as.data.frame(table_ts[,1])
colb <- as.data.frame(table_ts[,2])
#Add empty columns; The terminal output is then more convenient to read
colc <- as.data.frame(matrix("",ncol = 2, nrow = nrow(cola)))
cold <- as.data.frame(table_ts[,5])
table_ts <- cbind(cola,colb,colc,cold)
colnames(table_ts) <- c("Parameter", " "," ", "z-norm", "p-value")
#add the z-normalized values like
#Loop over each parameter
for (k in 1:length(parameters)) {
#Take the current distribution, meaning for the current parameter inside the cluster
current_distribution <- ts.dat_in[,parameters[k]]
#Z-normalize the chosen distributions
current_distribution <- znorm(current_distribution)
#Calculate the mean of the z-norm. distribution
mn <- as.character(round(mean(current_distribution), digits = 3))
#Calculate the std. deviation of the z-norm. distribution
stnd <- as.character(round(sd(current_distribution), digits = 3))
#Combine them in one character (better to present in the terminal)
mnstnd <- paste0(mn, " (", stnd, ")")
#Add mean and std. dev. in the summary table
table_ts[((3*k)-1),4] <- mnstnd
#Find the max. of the z-norm. distribution
mx <- as.character(round(max(current_distribution), digits = 3))
#Find the min. of the z-norm. distribution
min <- as.character(round(min(current_distribution), digits = 3))
#Combine them in one character (-"-)
mxmin <- paste0(min, " - ",mx)
#Add the range in the summary table
table_ts[(3*k),4] <- mxmin
}
#for summary: number of measurements; number of interpolated data
#Filter time series data for current cluster and determine length (= number of measurements)
n_mes <- filter(ts.dat, Cluster %in% clustno)
n_val <- length(n_mes[,1])
#Filter and determine length for both interpolated and non-interpolated data
n_real <- length(filter(n_mes, Interpolated == FALSE)[,1])
n_inter <- length(filter(n_mes, Interpolated == TRUE)[,1])
#Determine the ratio interpolated vs. non-interpolated data
inter_ratio <- round(100*(n_inter/(n_inter + n_real)), digits = 4)
#Printed feedback on observation of current cluster for user with previously
#prepared data
cat("\n\n", "Summary Cluster No.: ", clustno, "\n\n",
"Number of patients: ", n_pat, "\n", "Number of ts values: ", n_val,
"\n\n", "Real measurements: ", n_real, "\n", "Interpolated data: ",
n_inter, "(", inter_ratio, "%)", "\n\n", "Time series data:","\n")
print(table_ts)
cat("\n\n", "Population data:","\n")
print(table_enr)
}
#' Simulate random sampling for NA entries in enrichment data and check stability of resulting p-values for the enrichment parameters
#'
#' @param plist List storing patient time series data (also see function: \link{patient_list})
#' @param path Path where enrichment csv file is stored
#' @param clustdat Object of type list storing clustering data (also see function: \link{clust_matrix})
#' @param clustno Cluster number of interest
#' @param n_sim Number of simulations
#'
#' @return Object of type list storing the received p-values for each parameter in a vector and boxplot visualizing the received p-values
#'
#' @import arsenal
#' @import dplyr
#'
#' @examples
#' list <- patient_list('.../ts_demofiles1') #Just folder; files can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/ts_demofiles1)
#' #Sampling frequency is supposed to be daily
#' path <- '.../enrichment.csv' #Enrichment file can be pulled from GitHub demo files
#' #(https://github.com/MrMaximumMax/FBCanalysis/tree/master/demo_and_testfiles/enrichment)
#' test <- sim_sample_enr(list,path,clustering,1,100)
#' sim_sample_enr <- function(plist, path, clustdat, clustno, n_sim)
#'
#' @export
sim_sample_enr <- function(plist, path, clustdat, clustno, n_sim) {
#Analogous start as in function: add_enrich()
#Read csv-file on indicated path; empty fields in csv-file are filled up as NA
dat <- read.csv(path, na.strings=c("","NA"))
#Print first 5 lines of csv and present to user
cat("\n\n") #two empty lines as space before printing
print(head(dat[1:5,]))
cat("\n\n") #two empty lines after printing
#Create a data frame with the colnmaes and assigned ascending numbers to the colnames
feedback <- matrix(nrow = 2, ncol = length(colnames(dat)))
feedback[1,] <- colnames(dat)
feedback[2,] <- 1:length(dat)
feedback <- as.data.frame(feedback)
#Print the created feedback data frame
print(feedback, row.names=F, col.names=F, quote=F)
cat("\n")
#Ask the user which number (= which column) represents the Patient_ID
#This is of interest since later functionalities recognize the Patient_ID and
#do not consider them as a population parameter itself
question1 <- as.numeric(readline("Which column represents the Patient_ID (please indicate number)?: "))
#Rename column as "Patient_ID" so that later functionalities always recognize
#the patient column
names(dat)[names(dat) == feedback[1,question1]] <- 'Patient_ID'
#The enrichment data frame should be so prepared that Patient_ID's that appear in
#the indicated csv-file but not in the patient data list are thrown out and
#Patient_ID's that appear in the patient data list but not in the csv-file are
#added to the data frame and every parameter is filled with NA
#Get Patient_ID's from both previously loaded csv-file and patient time
#series data list
names_enr <- as.character(as.data.frame(table(dat[,'Patient_ID']))[,1])
names_ts <- names(plist)
#Check which patients are in enrich but not ts and throw them out
throw <- as.vector(dplyr::setdiff(names_enr,names_ts))
#Throw out Patient_ID's that do not appear in time series data list
dat_new1 <- filter(dat, !Patient_ID %in% throw)
#Check which patients are in ts but not enrich and add them with NAs
add <- dplyr::setdiff(names_ts,names_enr)
#In case there are Patient_ID's to add to the enrichment data frame
if (length(add > 0)) {
#Make a new matrix with same number of columns and colnames than csv-file
#Number of rows are number of Patient_ID's to be added
dat_new2 <- matrix(data = NA, nrow = length(add), ncol = length(colnames(dat)))
#Transform matrix to data frame
dat_new2 <- as.data.frame(dat_new2)
#Change the colnames to the colnames of the csv-file
colnames(dat_new2) <- colnames(dat)
#Add missing Patient_ID's to the Patient_ID column
dat_new2[,"Patient_ID"] <- add
#Rowbind the previously created data frame with the original enrichment data frame
dat_new <- rbind(dat_new1,dat_new2)
} else {
#In case there are no Patient_ID's to add, the previous data frame can be taken over
dat_new <- dat_new1
}
#Add cluster assignments to enrichment data
dat_new <- add_clust2enrich(dat_new,clustdat)
#Make a new list to store the p-values for each parameter from enrichment data
newlist <- list()
#Initialize progress bar
pb <- txtProgressBar(min = 0, max = n_sim, style = 3)
#For loop for each simulation
for (k in 1:n_sim) {
#Update progress bar
setTxtProgressBar(pb, k)
#dat_new_add is always taken for each simulation to sample values for the NA
#cells and then determine the respective p-values for the parameters
dat_new_add <- dat_new
#Go through every column and row and find cells with NA
for (i in 1:ncol(dat_new_add)) {
for (j in 1:nrow(dat_new_add)) {
#In case there is NA in current cell
if (is.na(dat_new_add[j,i])) {
#Take all values from current column where NA is found but disregard NAs
samp <- na.omit(dat_new_add[,i])
#Sample randomly for current cell
dat_new_add[j,i] <- sample(samp,1)
}
}
}
#Remove "Patient_ID" column from enrichment data, otherwise they would be
#recognized as categorial variable for summary
dat_new_add[,"Patient_ID"] <- NULL
#Split the data so that p-values can be calculated
#(in cluster = 0 vs. not in cluster = 1)
enrich_in <- filter(dat_new_add, Cluster == clustno)
enrich_in[,"Cluster"] <- 0
enrich_out <- filter(dat_new_add, Cluster != clustno)
enrich_out[,"Cluster"] <- 1
#Now bind them both together again
enrich_new <- rbind(enrich_in,enrich_out)
#From package "Arsenal"; Creates and calculates a summary of the data by cluster
#By default: p-values for continuous variables are calculated by Mann-Whitney test
#and Hypergeometric test for categorial variable
#The tableby() function creates a list where all the data is stored
table_enr <- tableby(Cluster ~., data = enrich_new)
#Now this list is transformed to a data frame to print easily the intermediary results
table_enr <- as.data.frame(summary(table_enr))
#Remove some reoccuring characters to make it readable
table_enr[,1] <- gsub(" ","",as.character(table_enr[,1]))
#Take out first part of summary, showing the parameters
col1 <- as.data.frame(table_enr[,1])
#Take out second part, showing the data for parameters inside cluster
col2 <- as.data.frame(table_enr[,2])
#Take out third part, showing the corresponding p-values
col3 <- as.data.frame(table_enr[,5])
#Bind the three parts
table_enr <- cbind(col1,col2,col3)
#Assign new colnames
colnames(table_enr) <- c("Parameter"," ","p-value")
#Modify the data frame so that first column stores the parameter and the second
#column the corresponding p-value
table_enr[,2] <- NULL
table_enr[ table_enr == "" ] <- NA
table_enr <- table_enr[complete.cases(table_enr),]
#For each parameter from the data frame
for (l in 1:nrow(table_enr)) {
#extract the corresponding p-value and store for k'th run of simulation
#in list
#Check if value = "< 0.001", replace with "0.001", otherwise it is not
#possible to make a boxplot; Else just take the received value
if (table_enr[l,2] == "< 0.001") {
newlist[[table_enr[l,1]]][k] <- 0.001
} else {
newlist[[table_enr[l,1]]][k] <- as.numeric(table_enr[l,2])
}
}
}
#Boxplot for sampling results
boxplot(newlist, main = paste("Enrichment sampling / Cluster No: ", clustno,
"/ n_simulations = ", n_sim), ylab = "p-value")
newlist
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.