Nothing
#' @title Plot static parallel coordinate clusters
#'
#' @description Perform hierarchical clustering analysis and visualize
#' results with parallel coordinate plots. Optionally, save gene IDs within
#' each cluster to .rds files for later use.
#'
#' @param data DATA FRAME | Read counts
#' @param dataMetrics LIST | Differential expression metrics; default NULL
#' @param dataSE SUMMARIZEDEXPERIMENT | Summarized experiment format that
#' can be used in lieu of data and dataMetrics; default NULL
#' @param geneList CHARACTER ARRAY | Array of ID values of genes to be drawn
#' from data as parallel coordinate lines. Use this parameter if you have
#' predetermined genes to be drawn. These genes will be clustered. Otherwise,
#' use dataMetrics, threshVar, and threshVal to create clusters to be
#' overlaid as parallel coordinate lines; default NULL. See package website
#' for examples
#' @param geneLists LIST | List of ID values of genes already clustered to be
#' drawn from data as parallel coordinate lines. Each list item is an array
#' of genes ID values that are already grouped as a cluster. Unlike the
#' singular geneList object, the plural geneLists object is not be clustered.
#' If you instead wish to cluster genes, use dataMetrics, threshVar, and
#' threshVal or geneList to create clusters to be overlaid as parallel
#' coordinate lines; default NULL. See package website for examples
#' @param threshVar CHARACTER STRING | Name of column in dataMetrics object
#' that is used to threshold significance; default "FDR"
#' @param threshVal INTEGER | Maximum value to threshold significance from
#' threshVar object; default 0.05
#' @param clusterAllData BOOLEAN [TRUE | FALSE] | Create clusters based on
#' the whole dataset and then assign significant genes to those clusters;
#' default is TRUE. If FALSE, create clusters based on just the significant
#' genes. With either option, the side-by-side boxplot will represent the
#' whole dataset (from data input) and the parallel coordinate lines will
#' represent only the significant genes (those that pass threshVal for
#' threshVar)
#' @param showPairs BOOLEAN [TRUE | FALSE] | When more than three treatment
#' groups are present, for each pairwise comparison, show only the results
#' for that pair of treatment groups; default is TRUE. If FALSE, show results
#' for all treatment groups even though clusters and significance are
#' determined in pairwise fashion. Note this parameter will not make a
#' difference when the data only contains two treatment groups
#' @param nC INTEGER | Number of clusters; default 4
#' @param colList CHARACTER ARRAY | List of colors for each cluster; default
#' is rainbow(nC)
#' @param aggMethod CHARACTER STRING ["ward.D" | "ward.D2" | "single" |
#' "complete" | "average" | "mcquitty" | "median" | "centroid"] | The
#' agglomeration method to be used in the hierarchical clustering; default
#' "ward.D"
#' @param yAxisLabel CHARACTER STRING | Vertical axis label; default "Count"
#' @param xAxisLabel CHARACTER STRING | Horizontal axis label; default
#' "Sample"
#' @param lineSize INTEGER | Size of plotted parallel coordinate lines;
#' default 0.1
#' @param lineAlpha INTEGER | Alpha value of plotted parallel coordinate
#' lines, default 0.5
#' @param vxAxis BOOLEAN [TRUE | FALSE] | Flip x-axis text labels to vertical
#' orientation; default FALSE
#' @param outDir CHARACTER STRING | Output directory to save all images;
#' default tempdir()
#' @param saveFile BOOLEAN [TRUE | FALSE] | Save file to outDir; default TRUE
#' @param verbose BOOLEAN [TRUE | FALSE] | Print each cluster from each
#' cluster size into separate files and print the associated IDs of each
#' cluster from each cluster size into separate .rds files; default is FALSE
#' @importFrom dplyr %>%
#' @importFrom tidyr gather
#' @importFrom ggplot2 ggplot element_text
#' @importFrom gridExtra arrangeGrob
#' @importFrom reshape melt
#' @importFrom grDevices rainbow
#' @importFrom graphics plot
#' @importFrom utils write.table
#' @importFrom grid grid.draw
#' @importFrom utils combn
#' @importFrom stats setNames
#' @seealso
#' \code{\link[stats]{hclust}}
#' \url{https://lindsayrutter.github.io/bigPint/articles/clusters.html}
#' @return List of n elements each containing a grid of parallel coordinate
#' plots, where n is the number of treatment pair combinations in the data
#' object. If the saveFile parameter has a value of TRUE, then each grid of
#' parallel coordinate plots is saved to the location specified in the outDir
#' parameter as a JPG file. If the verbose parameter has a value of TRUE,
#' then a JPG file for each parallel coordinate plot in each grid, RDS file
#' containing the superimposed IDs for each parallel coordinate plot in each
#' grid, and the JPG file of each grid of parallel coordinate plots is saved
#' to the location specified in the outDir parameter.
#' @export
#' @examples
#' # The first set of five examples use data and dataMetrics
#' # objects as input. The last set of five examples create the same plots now
#' # using the SummarizedExperiment (i.e. dataSE) object input.
#'
#' # Example 1: Perform hierarchical clustering of size four using the
#' # default agglomeration method "ward.D". Cluster only on the genes that have
#' # FDR < 1e-7 (n = 113) and overlay these genes.
#'
#' library(grid)
#' library(matrixStats)
#' library(ggplot2)
#' data(soybean_ir_sub)
#' soybean_ir_sub[,-1] <- log(soybean_ir_sub[-1]+1)
#' data(soybean_ir_sub_metrics)
#' colList = c("#00A600FF", rainbow(5)[c(1,4,5)])
#' ret <- plotClusters(data=soybean_ir_sub,
#' dataMetrics = soybean_ir_sub_metrics, nC=4, colList = colList,
#' clusterAllData = FALSE, threshVal = 1e-7, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#'
#' # Example 2: Perform the same analysis, only now create the four groups by
#' # clustering on all genes in the data (n = 5,604). Then, overlay the genes
#' # that have FDR < 1e-7 (n = 113) into their corresponding clusters.
#'
#' ret <- plotClusters(data=soybean_ir_sub,
#' dataMetrics = soybean_ir_sub_metrics, nC=4, colList = colList,
#' clusterAllData = TRUE, threshVal = 1e-7, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#'
#' # Example 3: Perform the same analysis, only now overlay all genes in the
#' # data by keeping the dataMetrics object as its default value of NULL.
#'
#' ret <- plotClusters(data=soybean_ir_sub, nC=4, colList = colList,
#' clusterAllData = TRUE, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#'
#' # Example 4: Visualization of gene clusters is usually performed on
#' # standardized data. Here, hierarchical clustering of size four is performed
#' # using the agglomeration method "average" on standardized data. Only genes
#' # with FDR < 0.05 are used for the clustering. Only two of the three
#' # pairwise combinations of treatment groups (S1 and S2; S1 and S3) have any
#' # genes with FDR < 0.05. The output plots for these two pairs are examined.
#'
#' data(soybean_cn_sub)
#' data(soybean_cn_sub_metrics)
#' soybean_cn_sub_st <- as.data.frame(t(apply(as.matrix(soybean_cn_sub[,-1]),
#' 1, scale)))
#' soybean_cn_sub_st$ID <- as.character(soybean_cn_sub$ID)
#' soybean_cn_sub_st <- soybean_cn_sub_st[,c(length(soybean_cn_sub_st),
#' 1:length(soybean_cn_sub_st)-1)]
#' colnames(soybean_cn_sub_st) <- colnames(soybean_cn_sub)
#' nID <- which(is.nan(soybean_cn_sub_st[,2]))
#' soybean_cn_sub_st[nID,2:length(soybean_cn_sub_st)] <- 0
#' ret <- plotClusters(data=soybean_cn_sub_st,
#' dataMetrics = soybean_cn_sub_metrics, nC=4,
#' colList = c("#00A600FF", "#CC00FFFF", "red", "darkorange"),
#' lineSize = 0.5, lineAlpha = 1, clusterAllData = FALSE,
#' aggMethod = "average", yAxisLabel = "Standardized read count",
#' saveFile = FALSE)
#' names(ret)
#' grid.draw(ret[["S1_S2_4"]])
#' grid.draw(ret[["S1_S3_4"]])
#'
#' # Example 5: Run the same analysis, only now set the verbose parameter to
#' # value TRUE. This will save images of each individual cluster, .rds files
#' # that contain the IDs within each cluster, and images of the conglomerate
#' # clusters to outDir (default tempdir()).
#'
#' \dontrun{
#' plotClusters(data=soybean_cn_sub_st, dataMetrics = soybean_cn_sub_metrics,
#' nC=4, colList = c("#00A600FF", "#CC00FFFF", "red", "darkorange"),
#' lineSize = 0.5, lineAlpha = 1, clusterAllData = FALSE,
#' aggMethod = "average", yAxisLabel = "Standardized read count",
#' verbose = TRUE)
#' }
#'
#' # Below are the same five examples, only now using the
#' # SummarizedExperiment (i.e. dataSE) object as input.
#'
#' # Example 1: Perform hierarchical clustering of size four using the
#' # default agglomeration method "ward.D". Cluster only on the genes that have
#' # FDR < 1e-7 (n = 113) and overlay these genes.
#'
#' \dontrun{
#' library(grid)
#' library(matrixStats)
#' library(ggplot2)
#' data(se_soybean_ir_sub)
#' assay(se_soybean_ir_sub) <- log(as.data.frame(assay(se_soybean_ir_sub))+1)
#' colList = c("#00A600FF", rainbow(5)[c(1,4,5)])
#' ret <- plotClusters(dataSE=se_soybean_ir_sub, nC=4, colList = colList,
#' clusterAllData = FALSE, threshVal = 1e-7, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#' }
#'
#' \dontrun{
#' # Example 2: Perform the same analysis, only now create the four groups by
#' # clustering on all genes in the data (n = 5,604). Then, overlay the genes
#' # that have FDR < 1e-7 (n = 113) into their corresponding clusters.
#'
#' ret <- plotClusters(dataSE=se_soybean_ir_sub, nC=4, colList = colList,
#' clusterAllData = TRUE, threshVal = 1e-7, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#' }
#'
#' # Example 3: Perform the same analysis, only now overlay all genes in the
#' # data by setting the rowData() to NULL.
#'
#' \dontrun{
#' se_soybean_ir_sub_nm <- se_soybean_ir_sub
#' rowData(se_soybean_ir_sub_nm) <- NULL
#' ret <- plotClusters(dataSE=se_soybean_ir_sub_nm, nC=4, colList = colList,
#' clusterAllData = TRUE, saveFile = FALSE)
#' grid.draw(ret[["N_P_4"]])
#' }
#'
#' # Example 4: Visualization of gene clusters is usually performed on
#' # standardized data. Here, hierarchical clustering of size four is performed
#' # using the agglomeration method "average" on standardized data. Only genes
#' # with FDR < 0.05 are used for the clustering. Only two of the three
#' # pairwise combinations of treatment groups (S1 and S2; S1 and S3) have any
#' # genes with FDR < 0.05. The output plots for these two pairs are examined.
#'
#' \dontrun{
#' data(se_soybean_cn_sub)
#' se_soybean_cn_sub_st = se_soybean_cn_sub
#' assay(se_soybean_cn_sub_st) <-as.data.frame(t(apply(as.matrix(as.data.frame(
#' assay(se_soybean_cn_sub))), 1, scale)))
#' nID <- which(is.nan(as.data.frame(assay(se_soybean_cn_sub_st))[,1]))
#' assay(se_soybean_cn_sub_st)[nID,] <- 0
#' ret <- plotClusters(dataSE=se_soybean_cn_sub_st, nC=4,
#' colList = c("#00A600FF", "#CC00FFFF", "red", "darkorange"),
#' lineSize = 0.5, lineAlpha = 1, clusterAllData = FALSE,
#' aggMethod = "average", yAxisLabel = "Standardized read count",
#' saveFile = FALSE)
#' names(ret)
#' grid.draw(ret[["S1_S2_4"]])
#' grid.draw(ret[["S1_S3_4"]])
#' }
#'
#' # Example 5: Run the same analysis, only now set the verbose parameter to
#' # value TRUE. This will save images of each individual cluster, .rds files
#' # that contain the IDs within each cluster, and images of the conglomerate
#' # clusters to outDir (default tempdir()).
#'
#' \dontrun{
#' plotClusters(dataSE=se_soybean_cn_sub_st, nC=4,
#' colList = c("#00A600FF", "#CC00FFFF", "red", "darkorange"),
#' lineSize = 0.5, lineAlpha = 1, clusterAllData = FALSE,
#' aggMethod = "average", yAxisLabel = "Standardized read count",
#' verbose = TRUE)
#' }
#'
#'
plotClusters <- function(data, dataMetrics = NULL, dataSE=NULL, geneList = NULL,
geneLists = NULL, threshVar="FDR", threshVal=0.05, clusterAllData = TRUE,
showPairs = TRUE, nC = 4, colList = rainbow(nC),
aggMethod = c("ward.D", "ward.D2", "single", "complete", "average",
"mcquitty", "median", "centroid"), yAxisLabel = "Count", xAxisLabel =
"Sample", lineSize = 0.1, lineAlpha = 0.5, vxAxis = FALSE,
outDir=tempdir(), saveFile = TRUE, verbose=FALSE){
aggMethod <- match.arg(aggMethod)
if (is.null(dataSE) && is.null(data)){
helperTestHaveData()
}
if (!is.null(dataSE)){
#Reverse engineer data
data <- helperGetData(dataSE)
if (ncol(rowData(dataSE))>0){
#Reverse engineer dataMetrics
reDataMetrics <- as.data.frame(rowData(dataSE))
dataMetrics <- lapply(split.default(reDataMetrics[-1],
sub("\\..*", "",names(reDataMetrics[-1]))), function(x)
cbind(reDataMetrics[1], setNames(x, sub(".*\\.", "", names(x)))))
for (k in seq_len(length(dataMetrics))){
colnames(dataMetrics[[k]])[1] = "ID"
}
}
}
# Check that input parameters fit required formats
helperTestData(data)
if (is.null(geneList) && !is.null(dataMetrics)){
helperTestDataMetrics(data, dataMetrics, threshVar)
}
key <- val <- ID <- rainbow <- NULL
myPairs <- helperMakePairs(data)[["myPairs"]]
colGroups <- helperMakePairs(data)[["colGroups"]]
cols.combn <- combn(myPairs, 2, simplify = FALSE)
if (!is.null(geneLists)){
nC = length(geneLists)
colList = rainbow(nC)
seqVec = seq(nC)
}
ret <- lapply(cols.combn, function(x){
group1 = x[1]
group2 = x[2]
fData <- cbind(ID=data$ID, data[,which(colGroups %in%
c(group1, group2))])
if (showPairs){
boxDat <- fData %>% gather(key, val, c(-ID))
colnames(boxDat) <- c("ID", "Sample", "Count")
userOrder <- unique(boxDat$Sample)
boxDat$Sample <- as.factor(boxDat$Sample)
levels(boxDat$Sample) <- userOrder
}
else{
boxDat <- data %>% gather(key, val, c(-ID))
colnames(boxDat) <- c("ID", "Sample", "Count")
userOrder <- unique(boxDat$Sample)
boxDat$Sample <- as.factor(boxDat$Sample)
levels(boxDat$Sample) <- userOrder
}
plotName <- paste0(group1,"_",group2)
if (!is.null(dataMetrics) && is.null(geneList)){
metricPair = dataMetrics[[paste0(group1,"_",group2)]]
metricPair = metricPair[order(metricPair[threshVar]),]
threshID <- metricPair[which(metricPair[threshVar] <=
threshVal),]$ID
cData <- fData[which(fData$ID %in% threshID),]
}
else if (!is.null(geneList)){
cData <- fData[which(fData$ID %in% geneList),]
}
else{
cData <- fData
}
if (!is.null(geneLists)){
nC = length(geneLists)
colList = rainbow(nC)
seqVec = seq(nC)
plot_clusters = lapply(seq_along(seqVec), function(j){
xSigNames = geneLists[[j]]
xSig = fData %>% filter(ID %in% xSigNames)
nGenes = nrow(xSig)
pcpDat <- melt(xSig, id.vars="ID")
colnames(pcpDat) <- c("ID", "Sample", "Count")
pcpDat$Sample <- as.character(pcpDat$Sample)
pcpDat$ID <- as.factor(pcpDat$ID)
p <- ggplot(boxDat, aes_string(x = 'Sample', y = 'Count')) + geom_boxplot() + geom_line(data=pcpDat, aes_string(x = 'Sample', y = 'Count', group = 'ID'), colour = colList[j], alpha=lineAlpha, size = lineSize) + ylab(yAxisLabel) + xlab(xAxisLabel) + ggtitle(paste("Cluster ", j, " Genes (n=", format(nGenes, big.mark=",", scientific=FALSE), ")", sep="")) + theme(plot.title = element_text(hjust = 0.5, size=14, face="plain"), axis.text=element_text(size=11), axis.title=element_text(size=14))
p
})
p = arrangeGrob(grobs=plot_clusters, ncol=2)
if (saveFile == TRUE || verbose == TRUE){
fileName = paste(outDir, "/", plotName, "_", nC, ".jpg", sep="")
jpeg(fileName)
grid.draw(p)
invisible(dev.off())}
return(p)
}
# geneList variable takes precedence. Create metricPair to only
# select geneList IDs
if(!is.null(geneList)){
metricPair = data.frame(ID = data$ID, FDR = 1)
metricPair$ID = as.character(metricPair$ID)
metricPair[which(metricPair$ID %in% geneList), ]$FDR = 0.1
threshVar = "FDR"
threshVal = 0.5
dataMetrics = 1
}
# Check if there are even any genes that pass the threshold.
# Then, perform clustering on whole dataset and assigned
# significant genes to those clusters from the full dataset.
if (nrow(data)>=nC && clusterAllData == TRUE && showPairs == TRUE){
p <- helperCadTRUEPair(data, dataMetrics, metricPair, aggMethod,
nC, threshVar, threshVal, verbose, vxAxis, saveFile, boxDat,
xAxisLabel, yAxisLabel, lineAlpha, lineSize, plotName,
outDir, colList)
return(p)
}
else if (nrow(data)>=nC && clusterAllData == TRUE && showPairs == FALSE){
p <- helperCadTRUE(data, dataMetrics, metricPair, aggMethod,
nC, threshVar, threshVal, verbose, vxAxis, saveFile, boxDat,
xAxisLabel, yAxisLabel, lineAlpha, lineSize, plotName,
outDir, colList)
return(p)
}
# Check if there are even any genes that pass the threshold. Then,
# perform clustering on just the significant genes.
if (nrow(cData)>=nC && clusterAllData == FALSE && showPairs == TRUE){
p <- helperCadFALSEPair(cData, dataMetrics, metricPair,
aggMethod, nC, threshVar, threshVal, verbose, vxAxis,
saveFile, boxDat, xAxisLabel, yAxisLabel, lineAlpha, lineSize,
plotName, outDir, colList)
return(p)
}
else if (nrow(cData)>=nC && clusterAllData == FALSE && showPairs == FALSE){
p <- helperCadFALSE(data, cData, dataMetrics, metricPair,
aggMethod, nC, threshVar, threshVal, verbose, vxAxis,
saveFile, boxDat, xAxisLabel, yAxisLabel, lineAlpha, lineSize,
plotName, outDir, colList)
return(p)
}
# Indicate if no significant genes existed
if (nrow(data)<nC && clusterAllData == TRUE){
print("Not enough data to cluster by that many groups")
}
# Indicate if not enough significant genes existed
if (nrow(cData)==0){
print(paste0(group1, "_", group2,
": There were no significant genes"))
}
else if (nrow(cData)<nC && clusterAllData == FALSE){
print(paste0(group1, "_", group2,
": Not enough significant genes (" ,nrow(cData),
",) to cluster by that many groups (", nC, ")"))
}
})
plotNames <- lapply(cols.combn, function(x){
group1 = x[1]
group2 = x[2]
paste0(group1,"_",group2,"_",nC)
})
names(ret) <- plotNames
invisible(ret)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.