Nothing
#' Get data for calculating distance matrix from phylogenetic profiles
#' @export
#' @usage getDataClustering(data, profileType = "binary", var1AggBy = "max",
#' var2AggBy = "max")
#' @param data a data frame contains processed and filtered profiles (see
#' ?fullProcessedProfile and ?filterProfileData, ?fromInputToProfile)
#' @param profileType type of data used for calculating the distance matrix.
#' Either "binary" (consider only the presence/absence status of orthlogs), or
#' "var1"/"var2" for taking values of the additional variables into account.
#' Default = "binary".
#' @param var1AggBy aggregate method for VAR1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @return A wide dataframe contains values for calculating distance matrix.
#' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
#' (tran@bio.uni-frankfurt.de)
#' @seealso \code{\link{fromInputToProfile}}
#' @examples
#' data("superTaxonProfile", package="PhyloProfile")
#' data <- superTaxonProfile
#' profileType <- "binary"
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "mean"
#' getDataClustering(data, profileType, var1AggregateBy, var2AggregateBy)
getDataClustering <- function(
data = NULL, profileType = "binary", var1AggBy = "max", var2AggBy = "max"
) {
if (is.null(data)) stop("Input data cannot be NULL!")
supertaxon <- presSpec <- NULL
# remove lines where there is no found ortholog
subDataHeat <- subset(data, data$presSpec > 0)
# transform data into wide matrix
if (profileType == "binary") {
subDataHeat <- subDataHeat[, c("geneID", "supertaxon", "presSpec")]
subDataHeat$presSpec[subDataHeat$presSpec > 0] <- 1
subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]
wideData <- data.table::dcast(
data.table::setDT(subDataHeat),
geneID ~ supertaxon, value.var = "presSpec")
} else {
var <- profileType
subDataHeat <- subDataHeat[, c("geneID", "supertaxon", var)]
subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]
# aggreagte the values by the selected method
if (var == "var1") aggregateBy <- var1AggBy
else aggregateBy <- var2AggBy
subDataHeat <- stats::aggregate(
subDataHeat[, var],
list(subDataHeat$geneID, subDataHeat$supertaxon),
FUN = aggregateBy
)
colnames(subDataHeat) <- c("geneID", "supertaxon", var)
wideData <- data.table::dcast(
data.table::setDT(subDataHeat),
geneID ~ supertaxon, value.var = var)
}
# set name for wide matrix as gene IDs
wideData <- as.data.frame(wideData)
dat <- wideData[, 2:ncol(wideData)]
rownames(dat) <- wideData[, 1]
dat[is.na(dat)] <- 0
return(dat)
}
#' Calculate the distance matrix
#' @export
#' @param profiles dataframe contains profile data for distance calculating
#' (see ?getDataClustering)
#' @param method distance calculation method ("euclidean", "maximum",
#' "manhattan", "canberra", "binary", "distanceCorrelation",
#' "mutualInformation" or "pearson" for binary data; "distanceCorrelation" or
#' "mutualInformation" for non-binary data). Default = "mutualInformation".
#' @return A calculated distance matrix for input phylogenetic profiles.
#' @importFrom bioDist mutualInfo
#' @importFrom bioDist cor.dist
#' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
#' (tran@bio.uni-frankfurt.de)
#' @seealso \code{\link{getDataClustering}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' method <- "mutualInformation"
#' getDistanceMatrix(profiles, method)
getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
if (is.null(profiles)) stop("Profile data cannot be NULL!")
profiles <- profiles[, colSums(profiles != 0) > 0]
profiles <- profiles[rowSums(profiles != 0) > 0, ]
distMethods <- c("euclidean", "maximum", "manhattan", "canberra", "binary")
if (method %in% distMethods) {
distanceMatrix <- stats::dist(profiles, method = method)
} else if (method == "distanceCorrelation") {
n <- seq_len(nrow(profiles))
matrix <- matrix(0L, nrow = nrow(profiles), ncol = nrow(profiles))
for (i in seq_len(nrow(profiles))) { # rows
p_i <- unlist(profiles[i,])
for (j in seq_len(nrow(profiles))) { # columns
if (i == j) break
matrix[i, j] <- dcor(p_i, unlist(profiles[j,]))
}
}
# Swich the value so that the profiles with a high correlation
# are clustered together
matrix <- 1 - matrix
matrix <- as.data.frame(matrix)
profileNames <- rownames(profiles)
colnames(matrix) <- profileNames[seq_len(length(profileNames)) - 1]
rownames(matrix) <- profileNames
distanceMatrix <- stats::as.dist(matrix)
} else if (method == "mutualInformation") {
distanceMatrix <- bioDist::mutualInfo(as.matrix(profiles))
distanceMatrix <- max(distanceMatrix, na.rm = TRUE) - distanceMatrix
} else if (method == "pearson") {
distanceMatrix <- bioDist::cor.dist(as.matrix(profiles))
}
return(distanceMatrix)
}
#' Create a hclust object from the distance matrix
#' @export
#' @param distanceMatrix calculated distance matrix (see ?getDistanceMatrix)
#' @param clusterMethod clustering method ("single", "complete",
#' "average" for UPGMA, "mcquitty" for WPGMA, "median" for WPGMC,
#' or "centroid" for UPGMC). Default = "complete".
#' @return An object class hclust generated based on input distance matrix and
#' a selected clustering method.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @seealso \code{\link{getDataClustering}},
#' \code{\link{getDistanceMatrix}}, \code{\link{hclust}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' distMethod <- "mutualInformation"
#' distanceMatrix <- getDistanceMatrix(profiles, distMethod)
#' clusterMethod <- "complete"
#' clusterDataDend(distanceMatrix, clusterMethod)
clusterDataDend <- function(distanceMatrix = NULL, clusterMethod = "complete") {
if (is.null(distanceMatrix)) stop("Distance matrix cannot be NULL!")
dd.col <- stats::hclust(distanceMatrix, method = clusterMethod)
return(dd.col)
}
#' Plot dendrogram tree
#' @export
#' @param dd dendrogram object (see ?clusterDataDend)
#' @return A dendrogram plot for the genes in the input phylogenetic profiles.
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
#' @seealso \code{\link{clusterDataDend}}
#' @examples
#' data("finalProcessedProfile", package="PhyloProfile")
#' data <- finalProcessedProfile
#' profileType <- "binary"
#' profiles <- getDataClustering(
#' data, profileType, var1AggregateBy, var2AggregateBy)
#' distMethod <- "mutualInformation"
#' distanceMatrix <- getDistanceMatrix(profiles, distMethod)
#' clusterMethod <- "complete"
#' dd <- clusterDataDend(distanceMatrix, clusterMethod)
#' getDendrogram(dd)
getDendrogram <- function(dd = NULL) {
if (is.null(dd)) stop("Input dendrogram cannot be NULL!")
p <- ape::plot.phylo(as.phylo(dd))
return(p)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.