scClassify: scClassify: single-cell Hierarchical Classification

Documented in plotCellTypeTree runHOPACH

#' Create HOPACH tree
#'
#' A function generating HOPACH tree using the average expression matrix for
#' each cell type.
#'
#' @param data A matrix of average expression matrix
#'  (each row indicates the gene, each column indicates the cell type)
#' @param plot Indicate whether plot or not
#' @param kmax Integer between 1 and 9 specifying the maximum number of children
#' at each node in the tree.
#' @return Return a \code{list} where
#' \itemize{
#' \item{cutree_list}: A list indicates the hierarchical cell type tree
#' \item{plot}: A \code{ggplot} visualise the cell type tree
#' }
#' @author Yingxin Lin
#' @importFrom stats cutree median lm
#' @importFrom hopach vectmatrix hdist is.hdist improveordering distancevector
#' as.hdist
#' @importFrom methods as is
#' @importFrom cluster pam
#' @importFrom graphics plot text
#' @import statmod
#'
#' @examples
#'
#' data("scClassify_example")
#' wang_cellTypes <- factor(scClassify_example$wang_cellTypes)
#' exprsMat_wang_subset <- scClassify_example$exprsMat_wang_subset
#' avgMat_wang <- apply(exprsMat_wang_subset, 1, function(x)
#' aggregate(x, list(wang_cellTypes), mean)$x)
#' rownames(avgMat_wang) <- levels(wang_cellTypes)
#' res_hopach <- runHOPACH(avgMat_wang)
#' res_hopach$plot
#'
#' @references van der Laan, M. J. and Pollard, K. S. (2003)
#' ‘A new algorithm for hybrid hierarchical clustering with
#' visualization and the bootstrap’,
#' Journal of Statistical Planning and Inference.
#' doi: 10.1016/S0378-3758(02)00388-9.
#' @export

runHOPACH <- function(data, plot= TRUE, kmax = 5) {

    # Compute the distance matrix using correlation
    dist <- distancematrix(data, d = "cor")

    # Run HOPACH function
    clustresult <- hopach(data, dmat = dist, kmax = kmax)


    # Rearrange HOPACH results
    final_labels <- strsplit(as.character(format(clustresult$final$labels,
                                                 scientific = FALSE)), "")

    # Get each level seperate
    level_list <- list()
    for (i in seq_len(nchar(clustresult$final$labels[1]))) {
        if (i != 1) {
            level_list[[i]] <- paste(level_list[[i - 1]],
                                     unlist(lapply(final_labels, "[[", i)),
                                     sep = "")
        }else{
            level_list[[i]] <- unlist(lapply(final_labels, "[[", i))
        }
    }



    # Generate cutree results
    cutree_list <- list()

    # First level (All ones)
    cutree_list[[1]] <- rep(1, length(rownames(data)))
    names(cutree_list[[1]]) <- rownames(data)

    cutree_list[2:(length(level_list) + 1)] <- lapply(level_list, function(x) {
        x <- as.numeric(as.factor(x))
        names(x) <- rownames(data)
        x
    })

    # Last levels will be all distinct numbers
    cutree_list[[length(cutree_list) + 1]] <-
        seq_len(length(clustresult$final$labels))
    names(cutree_list[[length(cutree_list)]]) <- rownames(data)

    # To plot using ggraph
    if (plot) {

        treePlot <- plotCellTypeTree(cutree_list)
        return(list(cutree_list = cutree_list, plot = treePlot))

    }else{
        return(list(cutree_list = cutree_list))
    }

}

#' To plot cell type tree
#'
#'
#' @param cutree_list A list indicates the hierarchical cell type tree
#' @param group_level Indicate whether plot or not
#'
#' @return A ggplot object visualising the HOPACH tree
#'
#' @importFrom ggraph geom_edge_diagonal geom_node_text geom_node_point ggraph
#' @import igraph
#'
#' @examples
#'
#' data("trainClassExample_xin")
#'
#' plotCellTypeTree(cellTypeTree(trainClassExample_xin))
#'
#' @export


plotCellTypeTree <- function(cutree_list, group_level = NULL) {

    cutree_list <- cutree_list

    if (is.null(group_level)) {
        group_level <- 3
    }

    E_list <- list()
    for (i in seq_len(length(cutree_list))) {
        if (i != 1) {
            parent <- cutree_list[[i - 1]]
            E_list[[i]] <- paste(paste(i - 1, parent, sep = ""),
                                 paste(i, cutree_list[[i]], sep = ""),
                                 sep = "_")
        }
    }

    V_list <- list()
    for (i in seq_len(length(cutree_list))) {
        V_list[[i]] <- paste(i, cutree_list[[i]], sep = "")
        names(V_list[[i]]) <- names(cutree_list[[i]])
    }


    g <- igraph::make_graph(unlist(strsplit(unique(unlist(lapply(E_list,
                                                                 sort))),
                                            "_")))



    ### Which level is used to color the node
    group_level <- group_level
    group_col <- (V_list[[group_level]])
    names(group_col) <- V_list[[length(cutree_list)]]


    V_group <- rep(NA, length(igraph::V(g)))
    names(V_group) <- names(V(g))
    V_group[V_list[[length(V_list)]]] <- group_col[V_list[[length(V_list)]]]
    V(g)$group <- V_group  # note only the bottom level are coloured


    ### Next, get the label for the bottom level

    V_name <- rep(NA, length(igraph::V(g)))
    names(V_name) <- names(V(g))
    V_name[V_list[[length(V_list)]]] <- names(V_list[[length(V_list)]])
    V(g)$name <- V_name




    treePlot <- ggraph::ggraph(g, layout = 'dendrogram') +
        ggraph::geom_edge_diagonal(colour = "black") +
        ggraph::geom_node_text(aes(vjust = 1, hjust = -0.2, label = name),
                               size = 2.7, alpha = 1) +
        ggraph::geom_node_point(aes(filter = leaf, alpha = 0.2, size = 3,
                                    colour = as.factor(V(g)$group))) +
        ggplot2::theme_void() +
        ggplot2::scale_colour_hue() +
        ggplot2::theme(legend.position = "none") +
        ggplot2::coord_flip() +
        NULL
    treePlot
}



#Hierarchical Ordered Partitioning and Collapsing Hybrid (HOPACH)#
#The codes here are originated from HOPACH pacakge with small edits
#
#Reference: ‘A new algorithm for hybrid hierarchical clustering with
# visualization and the bootstrap’,
# Journal of Statistical Planning and Inference.
# doi: 10.1016/S0378-3758(02)00388-9.

labelstomss <- function(labels, dist, khigh = 9, within = "med",
                        between = "med", hierarchical = TRUE) {
    if (!is.vector(labels))
        stop("First arg to labelstomss() must be a vector")

    p = methods::slot(dist, "Size")
    if (length(labels) != p)
        stop("Distance matrix and labels dimensions
             do not agree in labelstomss()")

    unlabels <- sort(unique(labels))
    k <- length(unlabels)
    ss <- NULL
    for (i in seq_len(k)) {
        labs <- (seq_len(p))[labels == unlabels[i]]
        pp <- length(labs)
        if (pp < 3)
            ss[i] <- NA
        else{
            dissvec <- methods::slot(dist[labs,labs], "Data")
            bestk <- silcheck(dissvec,min(khigh,(length(labs) - 1),
                                          na.rm = TRUE), diss = TRUE)[1]
            if (within == "med")
                ss[i] <- median(pam(dissvec, bestk,
                                    diss = TRUE)$silinfo$widths[, 3])
            if (within == "mean")
                ss[i] <- pam(dissvec,bestk,diss = TRUE)$silinfo$avg.width
        }
    }
    if (sum(is.na(ss)) == k)
        out <- NA
    else{
        if (hierarchical == TRUE) {
            parentlab <- trunc(labels/10)
            unparentlab <- sort(unique(parentlab))
            pk <- length(unparentlab)
            pss <- NULL
            for (i in seq_len(pk)) {
                if (between == "med")
                    pss[i] <- median(ss[trunc(unlabels/10) == unparentlab[i]],
                                     na.rm = TRUE)
                if (between == "mean")
                    pss[i] <- mean(ss[trunc(unlabels/10) == unparentlab[i]],
                                   na.rm = TRUE)
            }
            if (between == "med")
                out <- median(pss,na.rm = TRUE)
            if (between == "mean")
                out <- mean(pss,na.rm = TRUE)
        }
        else{
            if (between == "med")
                out <- median(ss,na.rm = TRUE)
            if (between == "mean")
                out <- mean(ss,na.rm = TRUE)
        }
    }
    return(out)
}


silcheck <- function(data, kmax=9, diss=FALSE, echo=FALSE, graph=FALSE)
{
    if ( !diss ) {
        if ( inherits(data, "dist"))
            stop("data argument is a dist object, but diss is FALSE")
        if ( is.matrix(data) && (nrow(data)  ==  ncol(data) ) )
            warning("data argument is square, could be a dissimilarity")
    }
    if ( diss && is.matrix(data) && nrow(data) != ncol(data) )
        stop("should be a dissimilarity matrix - but is not square")

    sil <- NULL
    m <- min(kmax, max((!diss)*(dim(data)[1] - 1),
                       (diss)*(0.5*(sqrt(1 + 8*length(data)) - 1)),
                       na.rm = TRUE))
    if (m < 2)
        out <- c(1, NA)
    else{
        for (i in seq_len((m - 1)))
            sil[i] <- pam(data, k = (i + 1), diss = diss)$silinfo$avg.width
        if (echo)
            cat("best k = ", order(sil)[length(sil)] + 1, ", sil(k) = ",
                round(max(sil),4), "\n")
        if (graph) {
            plot(2:m, sil, type = "n", xlab = "Number of Clusters",
                 ylab = "Average Silhouette")
            text(2:m, sil, 2:m)
        }
        out <- c(order(sil)[length(sil)] + 1,max(sil))
    }
    return(out)
}

#msscheck (mss)
msscheck <- function(dist, kmax=9, khigh=9, within="med", between="med",
                     force=FALSE, echo=FALSE, graph=FALSE) {
    p = methods::slot(dist, "Size")
    if (p < 3)
        out <- c(1,NA)
    else{
        dvec <- methods::slot(dist, "Data")

        kmax <- min(kmax,p - 1,na.rm = TRUE)
        if (force)
            mss <- 0
        else
            mss <- labelstomss(rep(1,p),dist,khigh,within,between)
        for (k in 2:kmax)
            mss[k] <- labelstomss(pam(dvec, k, diss = TRUE)$clust, dist,
                                  khigh, within, between)
        shift <- 0
        if (force) {
            mss <- mss[-1]
            shift <- 1
        }
        if (echo)
            cat("best k = ", order(mss)[1] + shift, ", mss(k) = ",
                round(min(mss),4), "\n")
        if (graph) {
            kmin <- ifelse(force,2,1)
            plot(kmin:kmax, mss, type = "n" ,xlab = "Number of Clusters",
                 ylab = "MSS")
            text(kmin:kmax,mss,kmin:kmax)
        }
        out <- c(order(mss)[1] + shift,min(mss))
    }
    return(out)
}

#2. Utility functions

#counts the number of digits in label
digits <- function(label) {
    label <- label[1]
    count <- 0
    while (label >= 1) {
        count <- count + 1
        label <- label/10
    }
    return(count)
}


# patrick's suggestion...
cutdigits <- function(labels, dig) {
    df <- max(0,digits(labels) - dig)
    return(trunc(labels/(10^df)))
}

#removes trailing zeros from labels
cutzeros <- function(labels) {
    for (i in seq_len(length(labels))) {
        while (trunc(labels[i]/10)*10 == labels[i]) {
            labels[i] <- labels[i]/10
        }
    }
    return(labels)
}

#returns the number of non-zero digits in labels
nonzeros <- function(labels) {
    for (i in seq_len(length(labels))) {
        while (trunc(labels[i]/10)*10 == labels[i]) {
            labels[i] <- labels[i]/10
        }
        labels[i] <- digits(labels[i])
    }
    return(labels)
}


msssplitcluster <- function(clust1, l1, id1, medoid1,
                            med2dist, right, dist1,
                            kmax = 9, khigh = 9,
                            within = "med", between = "med") {
    if (!medoid1)
        warning("Medoid missing - continue to split cluster")
    else{
        if (sum(medoid1 == id1) == 0 & medoid1)
            warning("Medoid not in cluster - continue to split cluster")
    }
    if (is.matrix(clust1)) {
        p1 <- length(clust1[,1])
        n <- length(clust1[1,])
    }
    else p1 <- 1
    if (p1 < 3)
        k1 <- 1
    else{
        l <- length(clust1[,1])
        dissvec <- methods::slot(dist1, "Data")

        kmax <- min(p1 - 1,kmax,na.rm = TRUE)
        khigh <- min(p1 - 1,khigh,na.rm = TRUE)
        k1 <- msscheck(dist1, kmax, khigh, within, between)[1]
        if (k1 > 1) {
            pamobj <- pam(dissvec, k1, diss = TRUE)
            newclussizes <- pamobj$clusinfo[,1]
            newmedoids1 <- id1[pamobj$medoids]
            newlabels1 <- pamobj$clustering
            distnewmedoids <- NULL
            for (j in seq_len(k1))
                distnewmedoids[j] <-
                mean(med2dist[newlabels1 == newlabels1[pamobj$medoids[j]]])
            if (right == 1)
                #ord <- order(distnewmedoids, decreasing = TRUE)
                ord <- rev(order(distnewmedoids))
            else
                ord <- order(distnewmedoids)
            newmedoids1 <- newmedoids1[ord]
            newclussizes <- newclussizes[ord]
            oldlab <- newlabels1
            for (j in seq_len(k1))
                newlabels1[oldlab == ord[j]] <- j
            newlabels1 <- rep(10*l1,l) + newlabels1
        }
    }
    if (k1 == 1) {
        newmedoids1 <- medoid1
        newlabels1 <- rep(10*l1,p1)
        newclussizes <- p1
    }
    for (a in seq_len(length(newmedoids1))) {
        if (sum(newmedoids1[a] == id1) == 0)
            warning("Problem with new medoids after splitting cluster")
    }
    list(k1,newmedoids1,newlabels1,newclussizes)
}



mssnextlevel <- function(data ,prevlevel, dmat, kmax = 9,
                         khigh = 9, within = "med", between = "med") {
    n <- length(data[1,])
    p <- length(data[,1])

    id <- seq_len(p)
    k <- prevlevel[[1]]
    medoids <- prevlevel[[2]]
    labels <- prevlevel[[4]]
    newk <- 0
    newlabels <- newmedoids <- newclussizes <- NULL
    count <- 1
    ordlabels <- sort(unique(labels))
    if (length(ordlabels) != k)
        warning("Number of unique labels not equal
                number of clusters in mssnextlevel()")
    if (sum(is.na(medoids))) {
        warning("Missing values in medoid vector in nextlevel()")
        medoids[is.na(medoids)] <- FALSE
    }
    if (length(unique(medoids)) < k && sum(medoids))
        warning("Medoids not unique in mssnextlevel()")
    checkmeans <- FALSE
    if (length(medoids) == 1 && !medoids) {
        warning("No medoids provided in mssnxtlevel()")
        usemean <- TRUE
    }
    else{
        if (sum(medoids > 1) == k)
            usemean <- FALSE
        else
            checkmeans <- TRUE
    }
    for (j in (seq_len(k))) {
        clust1 <- data[labels == ordlabels[j],]
        id1 <- id[labels == ordlabels[j]]
        if (length(id1) > 1)
            clust1 <- as.matrix(clust1)
        l1 <- ordlabels[j]
        right <- (j < k)
        medoid1 <- ifelse(is.na(medoids[j]),0,medoids[j])
        if (j < k)
            medoid2 <- medoids[j + 1]
        else
            medoid2 <- medoids[j - 1]
        if (length(id1) > 1) {
            med2dist <- rowMeans(as(dmat[labels == ordlabels[j],
                                         labels == labels[medoid2],
                                         drop = FALSE],"matrix"))
        }else{
            med2dist <- mean(dmat[labels == ordlabels[j],
                                  labels == labels[medoid2]])
        }

        splitobj <- msssplitcluster(clust1,l1,id1,medoid1,med2dist,right,
                                    dmat[labels == l1,labels == l1],
                                    kmax,khigh,within,between)
        newlabels[labels == ordlabels[j]] <- splitobj[[3]]
        k1 <- splitobj[[1]]
        start <- count
        end <- count + k1 - 1
        newmedoids[start:end] <- splitobj[[2]]
        newclussizes[start:end] <- splitobj[[4]]
        count <- count + k1
    }
    count <- newk <- count - 1
    newmedoids <- newmedoids[seq_len(newk)]
    newclussizes <- newclussizes[seq_len(newk)]
    final <- 0
    if (count == k)
        final <- 1
    if (max(newclussizes) == 3)
        final <- 1
    list(newk,newmedoids,newclussizes,newlabels,final,
         rbind(prevlevel[[6]],cbind(sort(unique(newlabels)),newmedoids)))
}

orderelements  <-  function(level, data, rel = "own",
                            d = "cosangle", dmat = NULL) {

    idn <- seq_len(length(data[,1]))
    k <- level[[1]]
    labels <- level[[4]]
    medoids <- level[[2]]
    clussizes <- level[[3]]
    ord <- order(labels)
    idnord <- idn[ord]
    subdataord <- data[ord,]

    if (is.null(dmat))
        dmat <- distancematrix(data,d)
    distord <- dmat[ord,]

    labelsord <- labels[ord]
    count <- 1
    for (j in (seq_len(k))) {
        start <- count
        end <- count + clussizes[j] - 1
        if (clussizes[j] > 2) {
            tempid <- idnord[start:end]
            if (rel == "co") {
                distj <- distord[,ord][start:end,start:end]
                idnord[start:end] <- tempid[improveordering(distj)]
            }
            else{
                if (rel == "neighbor") {
                    if (j < k)
                        mednext <- medoids[j + 1]
                    else
                        mednext <- medoids[j - 1]
                }else
                    mednext <- medoids[j]

                dmednext <- distord[start:end,mednext]

                if (rel == "neighbor") {
                    if (j < k)
                        #ordtemp <- order(dmednext, decreasing = TRUE)
                        ordtemp <- rev(order(dmednext))
                    else
                        ordtemp <- order(dmednext)
                }
                else
                    ordtemp <- order(dmednext)
                idnord[start:end] <- tempid[ordtemp]
            }
        }
        else
            idnord[start:end] <- idnord[start:end]
        count <- count + clussizes[j]
    }
    list(data[idnord,],idnord)
}



mssinitlevel <- function(data, kmax=9, khigh=9, d="cosangle", dmat=NULL,
                         within="med", between="med", ord="co",
                         verbose=FALSE)
{
    #print("mssinitlevel")
    if (!is.matrix(data))
        stop("First arg to mssinitlevel() must be a matrix")

    p <- length(data[,1])

    if (is.matrix(dmat))
        dmat <- as.hdist(dmat)

    else
        dmat <- distancematrix(data,d = d)

    if (methods::slot(dmat, "Size") != p)
        stop("Data and distance matrix dimensions
             do not agree in mssinitlevel()")

    m <- msscheck(dmat,kmax,khigh,within,between)
    if (m[1] == 1) {
        if (verbose)
            cat("No strong evidence for clusters in the first level -
          \n continuing to split root node anyway. \n")
        m <- msscheck(dmat,kmax,khigh,within,between,force = TRUE)
    }

    pamobj <- pam(methods::slot(dmat, "Data"), m[1], diss = TRUE)
    rowmedoids <- pamobj$medoids
    final <- ifelse(max(pamobj$clusinfo[,1]) < 3,1,0)
    if (m[1] > 2) {
        medoidsdata <- as.matrix(data[rowmedoids,])
        l <- length(rowmedoids)
        medoidsdist <- dmat[rowmedoids,rowmedoids]
        if (ord == "co")
            medoidsord <- improveordering(medoidsdist)
        if (ord == "clust") {
            mpamobj <- pam(methods::slot(medoidsdist, "Size"), 2, diss = TRUE)
            labelsmed <- mpamobj$clustering
            medmed <- mpamobj$medoids
            clussizes <- mpamobj$clusinfo[,1]

            prevlevel <- mssnextlevel(medoidsdata,list(2,medmed,clussizes,
                                                       labelsmed,0,
                                                       cbind(c(1,2),medmed)),
                                      dmat = medoidsdist,kmax,khigh,within,
                                      between)
            final <- prevlevel[[5]]
            if (final == 0) {
                depth <- (l - 1)
                for (j in seq_len(depth)) {
                    if (final == 0) {
                        clustnext <- mssnextlevel(medoidsdata,prevlevel,
                                                  dmat = medoidsdist,kmax,
                                                  khigh,within,between)
                        final <- clustnext[[5]]
                    }
                    if (final == 1) {
                        j <- depth
                        prevlevel <- clustnext
                    }
                }
            }
            medoidsord <- orderelements(prevlevel,medoidsdata,rel = "neighbor",
                                        d = d,dmat = medoidsdist)[[2]]
        }
        k <- m[1]
        rowmedoids <- rowmedoids[medoidsord]
        labels <- lab2 <- pamobj$clustering
        for (j in (seq_len(k)))
            lab2[labels == medoidsord[j]] <- j
        output <- list(k,rowmedoids,pamobj$clusinfo[,1][medoidsord],lab2,
                       final,cbind(seq_len(k),rowmedoids))
    }
    else
        output <- list(2,pamobj$medoids,pamobj$clusinfo[,1],
                       pamobj$clustering,final,cbind(seq_len(2),pamobj$medoids))
    return(output)
}


paircoll <- function(i,j,data,level,d="cosangle",
                     dmat=NULL, newmed="medsil") {
    p <- length(data[,1])
    k <- level[[1]]
    labels <- level[[4]]
    medoids <- level[[2]]
    clussizes <- level[[3]]
    N <- length(level[[6]][,1])
    block <- level[[6]][(N - k + 1):N,]
    if (N == k)
        prevblock <- NULL
    else
        prevblock <- level[[6]][seq_len(N - k),]
    if (max(i,j) > k)
        stop("The clusters to collapse do not exist in paircoll()")
    labeli <- labels[medoids[i]]
    labelj <- labels[medoids[j]]
    oldlabels <- labels
    labels[labels == labelj] <- labeli
    trunclabels <- trunc(oldlabels/10)
    labelparents <- unique(trunclabels)
    parenti <- order(labelparents)[labelparents == trunc(labeli/10)]
    parentj <- order(labelparents)[labelparents == trunc(labelj/10)]
    if (newmed == "nn")
        fakemed <- (data[medoids[i],]*clussizes[i] +
                        data[medoids[j],]*clussizes[j])/(clussizes[i] +
                                                             clussizes[j])
    if (newmed == "uwnn")
        fakemed <- (data[medoids[i],] + data[medoids[j],])/2
    if (newmed == "nn" || newmed == "uwnn") {
        rowsub <- (seq_len(p))[labels == labeli]
        distsfm <- distancevector(data[rowsub,],as.vector(fakemed),d)
        medoids[i] <- rowsub[order(distsfm)[1]]
    }
    else{
        #colldist <- as.matrix(dmat[labels == labeli,labels == labeli])
        colldist <- as(dmat[labels == labeli,labels == labeli],"matrix")

        rowsub <- (seq_len(p))[labels == labeli]
        if (newmed == "center") {
            sumdist <- rowSums(colldist)
        }
        if (newmed == "medsil") {
            othermed <- medoids[-c(i,j)]
            collp <- length(labels[labels == labeli])
            othern <- length(othermed)
            if (othern == 0)
                stop("Not enough medoids to use newmed='medsil' in paircoll()")
            if (is(dmat,"hdist")) {
                if (othern == 1) {
                    #otherdist <- cbind(dmat[labels == labeli,othermed])
                    otherdist <- dmat[labels == labeli,othermed]
                }else{
                    #otherdist <- rbind(dmat[labels == labeli,othermed])
                    otherdist <- dmat[labels == labeli,othermed]
                }
            }

            if (othern == 1)
                b <- otherdist
            else
                b <- apply(otherdist,1,min)
            b <- matrix(b,nrow=collp,ncol=collp)
            diag(b) <- 0
            b <- abs(b-colldist)/pmax(colldist,b)
            sumdist <- rowSums(b)
            medoids[i] <- rowsub[order(sumdist,decreasing=TRUE) == 1]
            #medoids[i] <- rowsub[rev(order(sumdist)) == 1]
        }
    }
    k <- k-1
    clussizes[i] <- clussizes[i] + clussizes[j]
    block[i,2] <- medoids[i]
    if (j<=k) {
        for (l in (j:k)) {
            medoids[l] <- medoids[l + 1]
            clussizes[l] <- clussizes[l + 1]
            block[l,] <- block[l + 1,]
        }
    }
    medoids <- medoids[seq_len(k)]
    clussizes <- clussizes[seq_len(k)]
    block <- block[seq_len(k),]
    if (labels[labels == labeli][1]/10 == trunclabels[labels == labeli][1]) {
        labels[labels == labeli] <- labels[labels == labeli] + 1
        labels[labels == labelj] <- labels[labels == labelj] + 1
        block[i,1] <- block[i,1] + 1
    }
    return(list(k,medoids,clussizes,labels,level[[5]],rbind(prevblock,block)))
}


collap <- function(data,level,d = "cosangle", dmat = NULL, newmed = "medsil") {
    k <- level[[1]]
    if (k < 3) {
        warning("Not enough medoids to use newmed='medsil' in collap() - \n
            using newmed='nn' instead \n")
        newmed <- "nn"
    }
    medoids <- level[[2]]
    clussizes <- level[[3]]
    if (sum(is.na(clussizes)))
        warning("NA in clussizes")
    medoidsdata <- data[medoids,]
    if (sum(is.na(medoidsdata)) > 0)
        warning("Missing value(s) in medoidsdata in collap()")

    distmed <- dmat[medoids,medoids]
    distv <- methods::slot(distmed, "Data")

    indexmin <- order(distv)[1]
    best <- vectmatrix(indexmin,k)
    clustfinal <- paircoll(best[1],best[2],data,level,d,dmat,newmed)
    return(clustfinal)
}

msscollap <- function(data,level,khigh,d="cosangle", dmat = NULL,
                      newmed = "medsil", within = "med",
                      between = "med", impr = 0) {

    newk <- level[[1]]
    mss1 <- labelstomss(level[[4]],dmat,khigh,within,between)
    maxncoll <- max(0,newk - 2)
    ncoll <- 0
    coll <- 1
    if (newk <= 2)
        coll <- 0
    while ((coll == 1) && (ncoll <= maxncoll)) {
        levelc <- collap(data,level,d,dmat,newmed)
        mss2 <- labelstomss(levelc[[4]],dmat,khigh,within,between)
        if (mss1 == 0)
            r <- 0
        else
            r <- (mss1 - mss2)/mss1
        if (r < impr)
            coll <- 0
        else{
            mss1 <- mss2
            level <- levelc
            ncoll <- ncoll + 1
        }
    }
    return(level)
}

mssmulticollap <- function(data,level,khigh,d="cosangle",
                           dmat = NULL, newmed = "medsil",
                           within = "med", between = "med", impr = 0) {
    if (!is.matrix(data))
        stop("First arg to mssmulticollap() must be a matrix")

    medoids <- level[[2]]
    medoidsdata <- data[medoids,]
    if (sum(is.na(medoidsdata)) > 0)
        warning("Missing value(s) in medoidsdata in mssmulticollap()")

    distmed <- dmat[medoids,medoids]
    k <- level[[1]]
    ord <- order(methods::slot(distmed, "Data"))

    mss1 <- labelstomss(level[[4]],dmat,khigh,within,between)
    maxncoll <- max(0,k*(k - 1)/2)
    ncoll <- 0
    i <- 1
    while (i <= maxncoll) {
        clusts <- vectmatrix(ord[i],k)
        if (k < 3) {
            if (newmed == "medsil")
                warning("Can't use newmed=medsil with less than 3 clusters. \n
                        Substituting newmed=nn")
            newmed <- "nn"
        }
        levelc <- paircoll(clusts[1],clusts[2],data,level,d,dmat,newmed)
        mss2 <- labelstomss(levelc[[4]],dmat,khigh,within,between)
        if (mss1 == 0)
            r <- 0
        else
            r <- (mss1 - mss2)/mss1
        if (r >= impr) {
            mss1 <- mss2
            level <- levelc
            ncoll <- ncoll + 1
            k <- level[[1]]
            maxncoll <- max(0,k*(k - 1)/2)
            i <- 0
            medoids <- level[[2]]
            medoidsdata <- data[medoids,]
            if (sum(is.na(medoidsdata)) > 0)
                warning("Missing value(s) in medoidsdata in mssmulticollap()")

            distmed <- dmat[medoids,medoids]
            ord <- order(methods::slot(distmed, "Data"))
        }
        i <- i + 1
    }
    return(level)
}


mssrundown <- function(data, K=16, kmax=9, khigh=9, d="cosangle",
                       dmat=NULL, initord="co", coll="seq",
                       newmed="medsil", stop=TRUE,
                       finish=FALSE, within="med",
                       between="med",impr=0, verbose=FALSE)
{
    #print("mssrundown")
    if (!is.matrix(data))
        stop("First arg to mssrundown() must be a matrix")

    bestlevel <- level <- mssinitlevel(data, kmax, khigh,
                                       d, dmat, within,
                                       between, initord, verbose)
    bestmss <- mss <- labelstomss(level[[4]],dmat,khigh,within,between)
    bestl <- l <- 1
    ind <- 0
    if (verbose)
        cat("Searching for main clusters... \n")
    if (level[[5]] == 1)
        return(level)
    while((l<=K) && (ind == 0)) {
        if (verbose) cat("Level ",l,"\n")
        if (coll == "seq")
            levelc <- msscollap(data,level,khigh,d,dmat,
                                newmed,within,between,impr)
        if (coll == "all")
            levelc <- mssmulticollap(data,level,khigh,d,
                                     dmat,newmed,within,between,impr)
        mss <- labelstomss(levelc[[4]],dmat,khigh,within,between)
        if (mss>=bestmss & stop == TRUE)
            ind <- 1
        else{
            if (mss<bestmss) {
                bestlevel <- levelc
                bestmss <- mss
                bestl <- l
            }
        }
        l <- l+1
        if (l<=K) {
            level <- mssnextlevel(data,levelc,dmat,kmax,khigh,within,between)
            if (finish == TRUE) {
                if (sum(trunc(level[[4]]/10)*10 == level[[4]]) ==
                    length(level[[4]]) & l<=K) {
                    ind <- 1
                    bestlevel <- levelc
                    bestmss <- mss
                    bestl <- (l-1)
                }
            }
        }
    }
    if (verbose)
        cat("Identified", bestlevel[[1]],
            " main clusters in level",
            bestl, "with MSS =",bestmss,"\n")
    return(bestlevel)
}

msscomplete <- function(level, data, K=16, khigh=9, d="cosangle",
                        dmat=NULL, within="med", between="med", verbose=FALSE)
{
    if (!is.matrix(data))
        stop("First arg to msscomplete() must be a matrix")

    count <- digits(level[[4]][1])
    if (verbose)
        cat("Running down without collapsing from Level",count,"\n")
    while ((max(level[[3]]) > 3) & (count < K)) {
        level <- newnextlevel(data,level,dmat,2,khigh)
        count <- count + 1
        if (verbose) cat("Level",count,"\n")
    }
    return(level)
}



newnextlevel <- function(data, prevlevel, dmat, klow=2, khigh=6) {
    #print("newnextlevel")
    if (!is.matrix(data))
        stop("First arg to newnextlevel() must be a matrix")
    p <- length(data[,1])
    n <- length(data[1,])
    id <- seq_len(p)
    k <- prevlevel[[1]]
    medoids <- prevlevel[[2]]
    labels <- prevlevel[[4]]
    newk <- 0
    newlabels <- newmedoids <- newclussizes <- NULL
    count <- 1
    ordlabels <- sort(unique(labels))
    if (length(ordlabels) != k)
        warning("Number of unique labels not equal
                number of clusters in newnextlevel()")
    if (sum(is.na(medoids))) {
        warning("Missing value(s) in medoid vector in newnextlevel()")
        medoids[is.na(medoids)] <- FALSE
    }
    if (length(unique(medoids)) < k && sum(medoids))
        warning("Medoids in newnextlevel() are not unique")
    checkmeans <- FALSE
    if (length(medoids) == 1 && !medoids) {
        warning("No medoids provided in newnextlevel()")
        usemean <- TRUE
    }
    else{
        if (sum(medoids > 1) == k)
            usemean <- FALSE
        else
            checkmeans <- TRUE
    }
    for (j in (seq_len(k))) {
        clust1 <- data[labels == ordlabels[j],]
        id1 <- id[labels == ordlabels[j]]
        if (length(id1) > 1) {
            kmax <- min(c(khigh,dim(clust1)[1] - 1))
            clust1 <- as.matrix(clust1)
        }
        l1 <- ordlabels[j]
        right <- (j < k)
        medoid1 <- ifelse(is.na(medoids[j]),0,medoids[j])
        if (j < k)
            medoid2 <- medoids[j + 1]
        else
            medoid2 <- medoids[j - 1]
        if (length(id1) > 1)
            med2dist <- rowMeans(as(dmat[labels == ordlabels[j],
                                         labels == labels[medoid2],
                                         drop = FALSE],"matrix"))
        else
            med2dist <- mean(dmat[labels == ordlabels[j],
                                  labels == labels[medoid2]])

        splitobj <- newsplitcluster(clust1,l1,id1,klow,kmax,medoid1,med2dist,
                                    right,dmat[labels == l1,labels == l1])
        newlabels[labels == ordlabels[j]] <- splitobj[[3]]
        k1 <- splitobj[[1]]
        start <- count
        end <- count + k1 - 1
        newmedoids[start:end] <- splitobj[[2]]
        newclussizes[start:end] <- splitobj[[4]]
        count <- count + k1 - 1 + 1
    }
    count <- count - 1
    newk <- count
    newmedoids <- newmedoids[seq_len(newk)]
    newclussizes <- newclussizes[seq_len(newk)]
    final <- 0
    if (count == k)
        final <- 1
    if (max(newclussizes) == 3)
        final <- 1
    return(list(newk,newmedoids,newclussizes,newlabels,
                final,rbind(rbind(prevlevel[[6]],
                                  cbind(sort(unique(newlabels)),newmedoids)))))
}

newsplitcluster <- function(clust1, l1, id1, klow=2, khigh=2, medoid1,
                            med2dist, right, dist1) {
    #print("newsplitcluter")
    if (!medoid1)
        warning("Medoid missing - continue to split cluster")
    else{
        if (sum(medoid1 == id1) == 0 & medoid1)
            warning("Medoid not in cluster - continue to split cluster")
    }
    if (is.matrix(clust1)) {
        p1 <- length(clust1[,1])
        n <- length(clust1[1,])
    }
    else p1 <- 1
    if (p1 < 3) {
        k1 <- 1
        newmedoids1 <- medoid1
        newlabels1 <- rep(10*l1,p1)
        newclussizes <- p1
    }
    else{
        l <- length(clust1[,1])
        dissvec <- methods::slot(dist1, "Data")

        kmax <- min(p1 - 1, khigh)
        a <- rep(0,(kmax - klow + 2))
        best <- 2
        for (j in (klow:kmax)) {
            a[j] <- pam(dissvec,j,diss = TRUE)$silinfo$avg.width
            if (a[j] > a[best]) best <- j
        }
        k1 <- best
        pamobj <- pam(dissvec, k1, diss = TRUE)
        newclussizes <- pamobj$clusinfo[,1]
        newmedoids1 <- id1[pamobj$medoids]
        newlabels1 <- pamobj$clustering
        distnewmedoids <- NULL
        for (j in seq_len(k1))
            distnewmedoids[j] <-
            mean(med2dist[newlabels1 ==
                              newlabels1[pamobj$medoids[j]]])
        if (right == 1)
            ord <- order(distnewmedoids, decreasing = TRUE)
        #ord <- rev(order(distnewmedoids))
        else
            ord <- order(distnewmedoids)
        newmedoids1 <- newmedoids1[ord]
        newclussizes <- newclussizes[ord]
        oldlab <- newlabels1
        for (j in seq_len(k1))
            newlabels1[oldlab == ord[j]] <- j
        newlabels1 <- rep(10*l1,l) + newlabels1
    }
    for (a in seq_len(length(newmedoids1))) {
        if (sum(newmedoids1[a] == id1) == 0)
            warning("Problem with new medoids after splitting cluster")
    }
    return(list(k1,newmedoids1,newlabels1,newclussizes))
}



hopach <- function(data, dmat=NULL, d="cosangle", clusters="best", K=15,
                   kmax=9, khigh=9, coll="seq", newmed="medsil",
                   mss="med", impr=0, initord="co", ord="own", verbose=FALSE) {
    # if (inherits(data,"ExpressionSet"))
    #   data  <-  exprs(data)
    data <- as.matrix(data)

    # Convert to hdist immediately #
    if ( is.null(dmat) ) {
        dmat <- distancematrix(data, d)
    }else if ( is.matrix(dmat) ) {
        dmat <- as(dmat, "hdist")
    }else if ( "dist" %in% is(dmat)) {
        dmat <- hdist(Data = as.numeric(dmat), Size = attr(dmat, "Size"),
                      Labels = seq_len(attr(dmat, "Size")),
                      Call = as.character(attr(dmat, "call"))[3])
    }else if (!is.hdist(dmat)) {
        stop("Distance matrix could not be created into hdist object.")
    }

    if (K > 15) {
        K <- 15
        warning("K set to 15 - can't do more than 15 splits")
    }
    if (K < 1) {
        K <- 1
        warning("K set to 1 - can't do less than 1 level")
    }
    if (clusters != "none") {
        cuttree <- mssrundown(data, K, kmax, khigh, d, dmat,
                              initord, coll, newmed,
                              stop = (clusters == "greedy"), finish = TRUE,
                              within = mss,
                              between = mss, impr, verbose)
        if (cuttree[[1]] > 1)
            cutord <- orderelements(cuttree, data, rel = ord, d, dmat)[[2]]
        else
            cutord <- NULL
        out1 <- list(k = cuttree[[1]], medoids = cuttree[[2]],
                     sizes = cuttree[[3]], labels = cuttree[[4]],
                     order = cutord)
        finaltree <- msscomplete(cuttree, data, K, khigh, d,
                                 dmat, within = mss, between = mss,
                                 verbose)
    }
    else{
        out1 <- NULL
        finaltree <- msscomplete(mssinitlevel(as.matrix(data),
                                              kmax, khigh, d, dmat,
                                              within = mss, between = mss,
                                              initord), data, K, khigh,
                                 d, dmat, within = mss,
                                 between = mss, verbose)
    }
    dimnames(finaltree[[6]]) <- list(NULL,c("label","medoid"))
    out2 <- list(labels = finaltree[[4]],
                 order = orderelements(finaltree, data, rel = ord, d,
                                       dmat)[[2]], medoids = finaltree[[6]])
    return(list(clustering = out1, final = out2, call = match.call(),
                metric = d))
}

distancematrix  <-  function(X, d, na.rm = TRUE)
{
    X  <-  as.matrix(X)
    if (d == "euclid")
        return(hopach::disseuclid(X, na.rm))
    if (d == "cor")
        return(hopach::disscor(X, na.rm))
    if (d == "abscor")
        return(hopach::dissabscor(X, na.rm))
    if (d == "cosangle")
        return(hopach::disscosangle(X, na.rm))
    if (d == "abscosangle")
        return(hopach::dissabscosangle(X, na.rm))
    stop("Distance metric ", d, " not available")
}
SydneyBioX/scClassify documentation built on Oct. 22, 2021, 4:03 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
SydneyBioX/scClassify
scClassify: single-cell Hierarchical Classification

R/runHOPACH.R
In SydneyBioX/scClassify: scClassify: single-cell Hierarchical Classification

Defines functions distancematrix hopach newsplitcluster newnextlevel msscomplete mssrundown mssmulticollap msscollap collap paircoll mssinitlevel orderelements mssnextlevel msssplitcluster nonzeros cutzeros cutdigits digits msscheck silcheck labelstomss plotCellTypeTree runHOPACH

Documented in plotCellTypeTree runHOPACH

R Package Documentation

Browse R Packages

We want your feedback!

SydneyBioX/scClassify scClassify: single-cell Hierarchical Classification

R/runHOPACH.R In SydneyBioX/scClassify: scClassify: single-cell Hierarchical Classification

Defines functions distancematrix hopach newsplitcluster newnextlevel msscomplete mssrundown mssmulticollap msscollap collap paircoll mssinitlevel orderelements mssnextlevel msssplitcluster nonzeros cutzeros cutdigits digits msscheck silcheck labelstomss plotCellTypeTree runHOPACH

Documented in plotCellTypeTree runHOPACH

R Package Documentation

Browse R Packages

We want your feedback!

SydneyBioX/scClassify
scClassify: single-cell Hierarchical Classification

R/runHOPACH.R
In SydneyBioX/scClassify: scClassify: single-cell Hierarchical Classification