#' Density-based clustering with DBSCAN
#'
#' Perform density-based clustering with a fast re-implementation of the DBSCAN algorithm.
#'
#' @inheritParams clusterRows
#' @param eps Numeric scalar specifying the distance to use to define neighborhoods.
#' If \code{NULL}, this is determined from \code{min.pts} and \code{core.prop}.
#' @param min.pts Integer scalar specifying the minimum number of neighboring observations required for an observation to be a core point.
#' @param core.prop Numeric scalar specifying the proportion of observations to treat as core points.
#' This is only used when \code{eps=NULL}, see Details.
#' @param chunk.size Integer scalar specifying the number of points to process per chunk.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the algorithm to use for the neighbor searches.
#' This should be able to support both nearest-neighbor and range queries.
#' @param num.threads Integer scalar specifying the number of threads to use.
#' @param BPPARAM Deprecated and ignored, use \code{num.threads} instead.
#' @param full Logical scalar indicating whether additional statistics should be returned.
#'
#' @details
#' DBSCAN operates by identifying core points, i.e., observations with at least \code{min.pts} neighbors within a distance of \code{eps}.
#' It identifies which core points are neighbors of each other, one \code{chunk.size} at a time, forming components of connected core points.
#' All non-core points are then connected to the closest core point within \code{eps}.
#' All groups of points that are connected in this manner are considered to be part of the same cluster.
#' Any unconnected non-core points are treated as noise and reported as \code{NA}.
#'
#' As a suitable value of \code{eps} may not be known beforehand, we can automatically determine it from the data.
#' For all observations, we compute the distance to the \eqn{k}th neighbor where \eqn{k} is defined as \code{round(min.pts * core.prop)}.
#' We then define \code{eps} as the \code{core.prop} quantile of the distances across all observations.
#' The default of \code{core.prop=0.5} means that around half of the observations will be treated as core points.
#'
#' Larger values of \code{eps} will generally result in fewer observations classified as noise, as they are more likely to connect to a core point.
#' It may also promote agglomeration of existing clusters into larger entities if they are connected by regions of (relatively) low density.
#' Conversely, larger values of \code{min.pts} will generally increase the number of noise points and may fragment larger clusters into subclusters.
#'
#' To modify an existing DbscanParam object \code{x},
#' users can simply call \code{x[[i]]} or \code{x[[i]] <- value} where \code{i} is any argument used in the constructor.
#'
#' @return
#' The \code{DbscanParam} constructor will return a \linkS4class{DbscanParam} object with the specified parameters.
#'
#' The \code{clusterRows} method will return a factor of length equal to \code{nrow(x)} containing the cluster assignments.
#' Note that this may contain \code{NA} values corresponding to noise points.
#' If \code{full=TRUE}, a list is returned with \code{clusters} (the factor, as above) and \code{objects}
#' (a list containing the \code{eps} and \code{min.pts} used in the analysis).
#'
#' @examples
#' clusterRows(iris[,1:4], DbscanParam())
#' clusterRows(iris[,1:4], DbscanParam(core.prop=0.8))
#'
#' @references
#' Ester M et al. (1996).
#' A density-based algorithm for discovering clusters in large spatial databases with noise.
#' \emph{Proceedings of the Second International Conference on Knowledge Discovery and Data Mining}, 226-231.
#'
#' @author Aaron Lun
#' @name DbscanParam-class
#' @docType class
#' @aliases
#' show,DbscanParam-method
NULL
#' @export
setClass(
"DbscanParam",
contains="BlusterParam",
slots=c(
eps="numeric_OR_NULL",
min.pts="integer",
core.prop="numeric",
chunk.size="integer",
BNPARAM="BiocNeighborParam",
num.threads="integer"
)
)
#' @export
#' @rdname DbscanParam-class
DbscanParam <- function(eps=NULL, min.pts=5, core.prop=0.5, chunk.size=1000, BNPARAM=KmknnParam(), num.threads=1, BPPARAM=NULL) {
if (!is.null(BPPARAM)) {
num.threads <- BiocParallel::bpnworkers(BPPARAM)
}
new("DbscanParam",
eps=eps,
min.pts=as.integer(min.pts),
core.prop=as.double(core.prop),
chunk.size=as.integer(chunk.size),
num.threads=as.integer(num.threads),
BNPARAM=BNPARAM
)
}
#' @export
setMethod("show", "DbscanParam", function(object) {
callNextMethod()
cat(sprintf("eps: %s\n", if (is.null(object[["eps"]])) "default" else object[["eps"]]))
for (i in c("min.pts", "core.prop", "num.threads")) {
cat(sprintf("%s: %s\n", i, object[[i]]))
}
for (i in c("BNPARAM")) {
cat(sprintf("%s: %s\n", i, class(object[[i]])[1]))
}
})
setValidity2("DbscanParam", function(object) {
msg <- character(0)
if (!is.null(object[["eps"]])) {
msg <- c(msg, .check_positive_slots(object, "eps"))
}
msg <- c(msg, .check_positive_slots(object, c("min.pts", "core.prop", "num.threads")))
if (object[["core.prop"]] > 1) {
msg <- c(msg, "'core.prop' should not be greater than 1")
}
if (length(msg)) {
return(msg)
}
TRUE
})
#' @export
#' @rdname DbscanParam-class
setMethod("clusterRows", c("ANY", "DbscanParam"), function(x, BLUSPARAM, full=FALSE) {
out <- .DBSCAN(x,
eps=BLUSPARAM[["eps"]],
min.pts=BLUSPARAM[["min.pts"]],
core.prop=BLUSPARAM[["core.prop"]],
chunk.size=BLUSPARAM[["chunk.size"]],
BNPARAM=BLUSPARAM[["BNPARAM"]],
num.threads=BLUSPARAM[["num.threads"]]
)
clusters <- out$clusters
clusters[clusters==0] <- NA_integer_
clusters <- factor(clusters)
if (full) {
out$clusters <- NULL
list(clusters=clusters, objects=out)
} else {
clusters
}
})
#' @importFrom BiocNeighbors buildIndex findNeighbors queryKNN findDistance
#' @importFrom stats quantile
#' @importFrom igraph make_graph components
#' @importFrom utils head tail
.DBSCAN <- function(x, eps=NULL, min.pts=5, core.prop=0.5, chunk.size=1000, BNPARAM=KmknnParam(), num.threads=1) {
x <- as.matrix(x)
# Finding all core points, using a quick NN algorithm.
all.dist <- findDistance(x, k=min.pts, BNPARAM=BNPARAM, num.threads=num.threads)
if (is.null(eps)) {
eps <- quantile(all.dist, core.prop)
}
is.core <- all.dist <= eps
# Identifying core-core components.
n.core <- sum(is.core)
core.idx <- buildIndex(x[is.core,,drop=FALSE], BNPARAM=BNPARAM)
remaining <- seq_len(n.core)
to <- vector("list", n.core)
while (length(remaining)) {
chosen <- head(remaining, chunk.size)
neighbors <- findNeighbors(BNINDEX=core.idx, subset=chosen, threshold=eps, get.distance=FALSE, num.threads=num.threads)$index
to[chosen] <- neighbors
remaining <- setdiff(tail(remaining, -chunk.size), unlist(neighbors))
}
from <- rep(seq_along(to), lengths(to))
to <- unlist(to)
g <- make_graph(rbind(from, to), n=n.core, directed=FALSE)
core.clusters <- components(g)$membership
# Finding closest core point (within range) for all non-core points.
clusters <- integer(nrow(x))
clusters[is.core] <- core.clusters
if (n.core) {
closest.core <- queryKNN(core.idx, query=x[!is.core,,drop=FALSE], num.threads=num.threads, k=1)
in.range <- closest.core$distance <= eps
clusters[which(!is.core)[in.range]] <- core.clusters[closest.core$index[in.range]]
}
list(clusters=clusters, eps=eps, min.pts=min.pts)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.