#' Bossa Similarity
#'
#' Calculate the similarity matrix of Bossa scores, which are obtained by boosting
#' on single attributes.
#'
#' @param data A data.frame or matrix (\code{n*p}) of original categorical data.
#' @param is.pca A logical value indicating whether the Bossa scores should be transformed
#' to principal components before the similarity matrix is calculated. This is recommended
#' when processing ultra-high-dimensional data.
#' @param pca.sum.prop A numeric value giving the proportion of variance that the retained
#' components should explain. The default is \code{pca.sum.prop = 0.95}.
#' @param n.comp The number of principal components to retain. The default is \code{n.comp = 50}.
#' @param fix.pca.comp A logical value indicating whether to keep a fixed number of
#' components (\code{n.comp}) rather than a fixed proportion of variance; the default is
#' to keep a fixed proportion.
#' @param alpha A power scaling for the Bossa scores, representing the weight of the
#' variable's sigma value.
#' @param pro.show A logical value indicating whether to show the details of the process.
#' @name BossaSimi
#' @examples {
#' ## generate sparse data from the toy model of CIDR
#' sparse.data <- data.frame(g.1 = c(0, 5, 0, 6, 8, 6, 7, 7), g.2 = c(5, 0, 0, 0, 5, 7, 5, 7))
#'
#' ## with low-dimensional data, PCA is unnecessary
#' bossa.change <- BossaSimi(sparse.data, is.pca = FALSE)
#'
#' ## data after normalization
#' data.after <- bossa.change$U.score.non.pca
#'
#' ## similarity matrix of normalized data
#' data.simi <- bossa.change$bossa.simi
#' }
#' @return A list including the Bossa scores, the Bossa dissimilarity matrix, and the Bossa
#' similarity matrix (used by \code{\link{OverlapClust}}).
#' @export
BossaSimi <- function(data, is.pca = TRUE, pca.sum.prop = 0.95, fix.pca.comp = FALSE,
n.comp = 50, alpha = 1, pro.show = FALSE) {
if (!is.data.frame(data)) {
if (is.matrix(data)) {
data <- as.data.frame(data)
}
}
delete.ind <- which(apply(data, 2, sum) == 0)
if (length(delete.ind) > 0) {
warning("The input data contains 'all zero' genes; these columns have been removed.")
data <- data[, -c(delete.ind)]
}
if (!all(complete.cases(data))) {
warning("The input data contains missing values; incomplete rows have been removed.")
data <- na.omit(data)
}
n <- dim(data)[1]
p <- dim(data)[2]
# if (!all(data == as.integer(data))) warning('The input data should be
# binary or ordinal')
if(pro.show) cat("Do transformation...\n")
U.score <- apply(data, 2, FUN = function(x) {
expres <- rle(sort(x))
expres.level <- expres$values
prop.level <- expres$lengths/n
n.level <- length(expres.level)
P <- prop.level[-n.level]
q <- qnorm(cumsum(P))
e <- -diff(c(0, exp(-q^2/2), 0))/sqrt(2 * pi)/prop.level
U.score.x <- vector(length = n)
for (j in 1:n.level) {
U.score.x[x == expres.level[j]] <- round(e[j],2)
}
return(U.score.x)
})
U.score.non.pca <- U.score
# Do pca to get small dimension version of U.score
if (is.pca) {
data.pca <- prcomp(U.score)
if(pro.show) cat("done")
data.pca.sdev <- data.pca$sdev
var.prop <- data.pca.sdev^2/sum(data.pca.sdev^2)
var.prop.sum <- cumsum(var.prop)
sum.prop.index <- which(var.prop.sum > pca.sum.prop)[1]
if (!fix.pca.comp) {
if(pro.show) cat(paste("need", sum.prop.index, "components to get", pca.sum.prop,
"% variance"))
data.pca.x <- data.pca$x[, 1:sum.prop.index]
} else {
if (n.comp > n | n.comp > p)
warning("The proposed number of components exceeds the rank of the data.",
call. = FALSE)
data.pca.x <- data.pca$x[, 1:min(n.comp, n, p)]
}
U.score <- data.pca.x
}
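# Scale each score column by its variance to the power alpha/2, then form the
# cell-by-cell similarity through the column correlation matrix.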
U.score.scale <- apply(U.score, 2, FUN = function(x) x * var(x)^(alpha/2))
bossa.simi <- U.score.scale %*% cor(U.score) %*% t(U.score.scale)
simi.max <- max(bossa.simi)
simi.min <- min(bossa.simi)
bossa.disimi <- (simi.max - bossa.simi)/(simi.max - simi.min)
U.score.non.pca <- as.data.frame(U.score.non.pca)
row.names(U.score.non.pca) <- row.names(data)
colnames(U.score.non.pca) <- colnames(data)
row.names(bossa.simi) <- row.names(data)
colnames(bossa.simi) <- row.names(data)
return(list(U.score.non.pca = U.score.non.pca, bossa.simi = bossa.simi,
bossa.disimi = bossa.disimi))
}
#' Overlap Clustering
#'
#' Perform overlap clustering with the Bossa similarity at different levels of \code{p}.
#'
#' @param data.simi The similarity matrix of Bossa scores obtained by \code{\link{BossaSimi}}.
#' @param p A set of quantiles (by default the 90th and 75th percentiles and the median) of
#' the positive values of the similarity matrix, used to form clusters at different levels
#' of within-cluster similarity.
#' @param lin A tuning parameter controlling the size of each overlap cluster before
#' merging; a smaller \code{lin} leads to a larger cluster size.
#' @param pro.show A logical value indicating whether to show the details of the process.
#' @import stats
#' @return A list of two data.frames: the overlap sub-clusters and the center of each cluster.
#' @examples {
#' data(bo.simu.data)
#'
#' ## calculate the similarity matrix
#' bossa.simi <- BossaSimi(bo.simu.data)$bossa.simi
#'
#' overlap.clust <- OverlapClust(bossa.simi)
#' }
#' @export
OverlapClust <- function(data.simi, p = c(0.9, 0.75, 0.5), lin = 0.25, pro.show = FALSE) {
n <- dim(data.simi)[1]
if(pro.show) cat("Do overlap clustering for", n, "observations...")
overlap.clu <- cbind(first.clu = rep(0, n), belong.layer = rep(0, n))
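# Bookkeeping matrix, one row per cell:
#   column 1 'first.clu'    : index of the first cluster the cell joins (0 = unassigned)
#   column 2 'belong.layer' : quantile level l at which the cell is first assigned (0 = unassigned)
#   columns 3, 4, ...       : one 0/1 membership indicator appended per discovered cluster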
n.clu <- 1
clust <- list()
simi.tri <- data.simi[lower.tri(data.simi)]
core.thresh <- quantile(simi.tri[simi.tri > 0], p)
self <- diag(data.simi)
clust.center <- data.frame()
for (l in 1:length(p)) {
thresh <- core.thresh[l]
jump0 <- 0
# Look for the critical centers ------------------------
while (jump0 == 0) {
max.var.idx <- which.max(self)[1]
if (n.clu == 0) n.clu <- 1
clust[[n.clu]] <- max.var.idx
candidt <- sort(data.simi[max.var.idx, ], decreasing = TRUE, index.return = TRUE)
candidt.idx <- (1:n)[candidt$ix[candidt$x > thresh]]
if(length(candidt.idx) > 1) {
for (i in candidt.idx) {
if (i == max.var.idx)
next
candidt.simi <- data.simi[i, c(unlist(clust[[n.clu]]), i)]
candidt.simi.check <- candidt.simi[1] # distance between i and the center
if (lin < 0 || lin >= 1)
warning("The parameter 'lin' is not appropriate.")
candidt.simi.check <- ifelse(lin > 0 & lin < 1, quantile(candidt.simi,
lin), ifelse(lin == 0, min(candidt.simi), candidt.simi.check))
if (candidt.simi.check > thresh)
clust[[n.clu]] <- append(clust[[n.clu]], i)
}
}
sum.new.idx <- sum(overlap.clu[unlist(clust[[n.clu]]), 1] == 0)
if(pro.show) cat(paste(sum.new.idx, " new points are assigned to the current cluster, for a total of ",
length(clust[[n.clu]]), " cells.\n", sep = ""))
if (sum.new.idx > 1 & n.clu <= 50) {
self[unlist(clust[[n.clu]])] <- 0
clust.new <- ifelse(1:n %in% unlist(clust[[n.clu]]), 1,
0)
overlap.clu <- cbind(overlap.clu, clust.new)
change.position.1 <- as.logical(clust.new) & overlap.clu[,
1] == 0
overlap.clu[change.position.1, 1] <- n.clu
change.position.2 <- as.logical(clust.new) & overlap.clu[,
2] == 0
overlap.clu[change.position.2, 2] <- l
clust.center <- rbind(clust.center, c(l, max.var.idx))
n.clu <- n.clu + 1
} else {
jump0 <- 1
clust <- clust[-n.clu]
if (n.clu == 1)
n.clu <- 0
}
}
# Consider the non-neighbours of max variance points ------------------
if (n.clu == 0)
next
sum.layer.0 <- sum(overlap.clu[, 2] == l)
if(sum.layer.0 > 0) {
candidt.idx.2 <- which(overlap.clu[, 2] != l)
candidt.clu <- which(clust.center[, 1] == l)
for (i in candidt.idx.2) {
clu.simi <- sapply(candidt.clu, FUN = function(j) {
candidt.simi <- data.simi[i, unlist(clust[[j]])]
candidt.simi.check <- max(candidt.simi)
candidt.simi.check <- ifelse(lin > 0 & lin < 1,
quantile(candidt.simi, lin),
ifelse(lin == 0,
min(candidt.simi),
candidt.simi.check))
return(candidt.simi.check)
})
max.simi <- max(clu.simi)
if (max.simi > thresh) {
max.clu.idx <- candidt.clu[which.max(clu.simi)]
clust[[max.clu.idx]] <- append(clust[[max.clu.idx]], i)
if (overlap.clu[i, 1] == 0)
overlap.clu[i, 1] <- max.clu.idx
if (overlap.clu[i, 2] == 0)
overlap.clu[i, 2] <- l
overlap.clu[i, 2 + max.clu.idx] <- 1
}
}
}
sum.layer.1 <- sum(overlap.clu[, 2] == l)
new <- sum.layer.1 - sum.layer.0
if (l > 1 && pro.show)
cat(paste("After the second search,", new, "new cells were assigned.\n\n"))
}
sum.clu <- dim(overlap.clu)[2] - 2
colnames(overlap.clu) <- c("first.clu", "belong.layer",
paste("clust.", 1:sum.clu, sep = ""))
return(list(overlap.clu = overlap.clu, clust.center = clust.center))
}
#' Calculate the share matrix among overlap clusters.
#'
#' @param overlap.clu A 0/1 membership matrix of the overlap clustering result,
#' one column per overlap sub-cluster.
#' @keywords internal
ClustShare <- function(overlap.clu) {
m <- dim(overlap.clu)[2]
share.mat <- matrix(0, m, m)
for (i in 1:(m - 1)) {
for (j in (i + 1):m) {
share.mat[i, j] <- share.mat[j, i] <- sum(overlap.clu[, i] == 1 & overlap.clu[, j] == 1)
}
}
diag(share.mat) <- colSums(overlap.clu)
return(share.mat)
}
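## A minimal sketch (hypothetical 0/1 membership columns) of what ClustShare returns;
## off-diagonal [i, j] counts the cells shared by sub-clusters i and j, and the
## diagonal holds the sub-cluster sizes:
## clu <- cbind(clust.1 = c(1, 1, 0, 1), clust.2 = c(0, 1, 1, 1))
## ClustShare(clu)
## #      [,1] [,2]
## # [1,]    3    2
## # [2,]    2    3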
#' Do a test before merging two overlap clusters.
#'
#' Perform a pairwise test on two overlap sub-clusters to decide whether they
#' should be merged.
#' @keywords internal
#' @param data A data.frame or matrix with two columns, one per overlap sub-cluster,
#' coded as integers starting from 0.
#' @param r A numeric value giving the tetrachoric correlation coefficient of the two columns.
TestMergeClust <- function(data, r) {
n <- dim(data)[1]
P <- list()
for (i in 1:2) {
dat <- data[, i]
rlei <- rle(sort(dat))
labeli <- rlei$values
pri <- rlei$lengths/n
leni <- length(labeli)
P[[i]] <- pri[-leni]
}
p1 <- P[[1]]
p2 <- P[[2]]
K1 <- length(p1) + 1
K2 <- length(p2) + 1
q1 <- qnorm(cumsum(p1))
q2 <- qnorm(cumsum(p2))
e1 <- -diff(c(0, exp(-q1^2/2), 0))/sqrt(2 * pi)/c(p1, 1 - sum(p1))
e2 <- -diff(c(0, exp(-q2^2/2), 0))/sqrt(2 * pi)/c(p2, 1 - sum(p2))
U1 <- e1[data[, 1] + 1]
U2 <- e2[data[, 2] + 1]
stat <- sum(U1 * U2)
Low <- c(-Inf, q1)
Up <- c(q1, Inf)
q2l <- c(-Inf, q2)
q2u <- c(q2, Inf)
cvs <- 0
cv <- 0
for (i in 1:K1) {
f1 <- function(x) {
dnorm(x) * pnorm((q2u[1] - r * x)/sqrt(1 - r^2))
}
If1 <- integrate(f1, lower = Low[i], upper = Up[i])$value
cv <- cv + If1 * e1[i] * e2[1]
cvs <- cvs + If1 * e1[i]^2 * e2[1]^2
fk2 <- function(x) {
dnorm(x) * pnorm((r * x - q2l[K2])/sqrt(1 - r^2))
}
Ifk2 <- integrate(fk2, lower = Low[i], upper = Up[i])$value
cv <- cv + Ifk2 * e1[i] * e2[K2]
cvs <- cvs + Ifk2 * e1[i]^2 * e2[K2]^2
if (K2 > 2) {
for (j in 2:(K2 - 1)) {
f <- function(x) {
dnorm(x) * (pnorm((q2u[j] - r * x)/sqrt(1 - r^2)) - pnorm((q2l[j] -
r * x)/sqrt(1 - r^2)))
}
If <- integrate(f, lower = Low[i], upper = Up[i])$value
cv <- cv + If * e1[i] * e2[j]
cvs <- cvs + If * e1[i]^2 * e2[j]^2
}
}
}
vu <- n * (cvs - cv^2)
sstat <- stat/sqrt(vu)
pval <- 2 * (1 - pnorm(abs(sstat)))
return(list(stat = sstat, pval = pval))
}
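## A minimal sketch of a direct call (normally invoked from KeyFeature); the two
## columns must be coded 0, 1, ..., k - 1 and 'r' is their tetrachoric correlation,
## here an assumed value:
## x <- cbind(c(0, 1, 1, 0, 1, 0, 1, 1), c(0, 1, 0, 0, 1, 0, 1, 1))
## TestMergeClust(x, r = 0.8)   # returns list(stat = ..., pval = ...)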
#' Determine key features of each overlap cluster.
#' @param cdata Overlap clustering result.
#' @param cri A tuning parameter: if the p-value is smaller than \code{cri}, the null
#' hypothesis is rejected and the two overlap sub-clusters are merged. \code{cri} can be
#' any numeric value less than \code{1}; if \code{cri = 1} the criterion is reset to
#' \code{0.05/N} (N is the number of overlap sub-clusters), and for any larger value the
#' criterion becomes \code{0.05/(N(N-1))}.
#' @param sum.clu The number of overlap sub-clusters.
#' @keywords internal
#' @importFrom psych tetrachoric
#' @return A list including the recommended parameters for cutting trees in hierarchical
#' clustering (\code{scrit0}, \code{scrit1}), the clearly separated sub-clusters
#' (\code{sep.clu}), the highly similar sub-clusters (\code{mer.clu}), and the statistics
#' used for merging (\code{clu.math.stat}).
KeyFeature <- function(cdata, cri, sum.clu) {
cdata <- cdata[, -c(1, 2)]
criter <- ifelse(cri < 1,
cri, ifelse(cri == 1,
0.05/sum.clu,
0.05/sum.clu/(sum.clu - 1)))
n <- dim(cdata)[1]
m <- dim(cdata)[2]
cmax <- max(apply(cdata, 2, max))
r <- psych::tetrachoric(cdata, smooth = FALSE)$rho
Stats <- matrix(0, m, m)
Pval <- matrix(1, m, m)
for (i in 1:(m - 1)) {
for (j in (i + 1):m) {
# recode the levels of each column to 0, ..., k - 1 -------------
c1 <- cdata[, i]
uc1 <- unique(c1)
sc1 <- sort(uc1)
for (c1i in sc1) {
c1[c1 == c1i] <- (1:length(uc1))[sc1 == c1i] - 1
}
c2 <- cdata[, j]
uc2 <- unique(c2)
sc2 <- sort(uc2)
for (c2i in sc2) {
c2[c2 == c2i] <- (1:length(uc2))[sc2 == c2i] - 1
}
tij <- TestMergeClust(cbind(c1, c2), r[i, j])
Stats[i, j] <- tij$stat
Pval[i, j] <- tij$pval
}
}
Sp <- Stats
Sp[Pval > criter] <- 0
Pp <- Pval
Pp[Pval > criter] <- 1
Pp[Sp < 0] <- 1
psortp <- sort(Pp, index.return = T)
featindp <- psortp$ix[psortp$x < 1]
flenp <- length(featindp)
if (flenp == 0) {
kfeatp <- 0
} else {
kfeatp <- rep(0, flenp)
for (ii in 1:flenp) {
fii <- floor(featindp[ii]/m)
fij <- featindp[ii] - fii * m
kfeatp[ii] <- paste(colnames(cdata)[fij],
colnames(cdata)[fii + 1], sep = ", ")
}
}
Sn <- Stats
Sn[Pval > criter] <- 0
Pn <- Pval
Pn[Pval > criter] <- 1
Pn[Sn > 0] <- 1
psortn <- sort(Pn, index.return = T)
featindn <- psortn$ix[psortn$x < 1]
flenn <- length(featindn)
if (flenn == 0) {
kfeatn <- 0
} else {
kfeatn <- rep(0, flenn)
for (ii in 1:flenn) {
fii <- floor(featindn[ii]/m)
fij <- featindn[ii] - fii * m
kfeatn[ii] <- paste(colnames(cdata)[fij],
colnames(cdata)[fii + 1], sep = ",")
}
}
stat <- Stats
stat <- stat + t(stat) - diag(diag(stat))
maxs <- max(stat)
mins <- min(stat)
crit <- qnorm(criter)
stat <- (maxs - stat)/(maxs - mins)
scrit0 <- (maxs + crit)/(maxs - mins)
scrit1 <- (maxs - crit)/(maxs - mins)
return(list(mer.clu = kfeatp, sep.clu = kfeatn, clu.math.stat = stat, scrit0 = scrit0,
scrit1 = scrit1))
}
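## A minimal sketch (hypothetical objects) of how KeyFeature is typically called on the
## output of OverlapClust; the psych package must be installed for the tetrachoric
## correlations:
## clu <- OverlapClust(bossa.simi)$overlap.clu
## KeyFeature(clu, cri = 1, sum.clu = ncol(clu) - 2)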
#' Assign the remaining points
#'
#' Assign each point that does not yet belong to any cluster to the cluster with
#' the largest similarity.
#'
#' @param overlap.clu The label matrix of the overlap clusters.
#' @param sum.clu The number of overlap clusters.
#' @param data.simi The similarity matrix.
#' @param n The number of observations.
#' @import stats
#' @keywords internal
AssignLeftClust <- function(overlap.clu, sum.clu, data.simi, n = nrow(data.simi)){
non.core.ind <- (1:n)[apply(overlap.clu[, -c(1, 2)], 1, sum) == 0]
if(length(non.core.ind) > 0){
for (i in non.core.ind) {
maxci <- rep(0, sum.clu)
ij <- 0
for (j in 1:sum.clu) {
ij <- ij + 1
maxci[ij] <- quantile(data.simi[i, overlap.clu[, 1] == j],
0.5)
}
max.ind <- which.max(maxci)
overlap.clu[i, 1] <- max.ind
overlap.clu[i, (max.ind + 2)] <- 1
}
}
return(overlap.clu)
}
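## A minimal sketch (hypothetical objects): assign every cell that belongs to no
## sub-cluster yet to the most similar one, e.g.
## clu <- AssignLeftClust(overlap.clu, sum.clu = ncol(overlap.clu) - 2,
##                        data.simi = bossa.simi, n = nrow(bossa.simi))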
#' Clust Merge
#'
#' For a recommended number of clusters \code{k}, merge the overlap clusters according
#' to the distance between them.
#' @param tree.size A vector (or single number) whose maximum is used as the number of
#' merged clusters \code{k}.
#' @param clu.hc An \code{hclust} object built on the overlap sub-clusters.
#' @param n The number of observations.
#' @param overlap.clu The label matrix of the overlap clusters.
#' @param data.simi The similarity matrix.
#' @keywords internal
ClustMerge <- function(tree.size, clu.hc, n, overlap.clu, data.simi)
{
k <- max(tree.size)
tree.merge.index <- cutree(clu.hc, k = k)
new.overlap.clu <- data.frame(cell = rownames(data.simi))
for (clu in 1:k) {
clu.ind <- which(tree.merge.index == clu)
new.overlap <- overlap.clu[, (clu.ind[1] + 2)]
for (j in clu.ind) {
new.overlap <- (overlap.clu[, j + 2] | new.overlap)
}
new.overlap.clu <- cbind(new.overlap.clu, new.overlap)
}
colnames(new.overlap.clu) <- c("cell", paste("merge.clu", 1:k, sep = ""))
for (j in 2:dim(new.overlap.clu)[2]) {
new.overlap.clu[, j] <- as.numeric(new.overlap.clu[, j])
}
return(new.overlap.clu)
}
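## A minimal sketch (hypothetical objects): merge the overlapping sub-clusters by
## cutting a hierarchical clustering 'clu.hc' of the sub-clusters at k = max(tree.size), e.g.
## merged <- ClustMerge(tree.size = 3, clu.hc = clu.hc, n = nrow(bossa.simi),
##                      overlap.clu = overlap.clu, data.simi = bossa.simi)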
#' Reindex cluster labels in ascending order
#'
#' Given an \code{\link[stats]{hclust}} object and the number of clusters \code{k},
#' this function reindexes the clusters inferred by \code{cutree(hc, k)[hc$order]} so that
#' they appear in ascending order. This is particularly useful when plotting
#' heatmaps in which the clusters should be numbered from left to right.
#'
#' @param hc An object of class \code{hclust}.
#' @param k The number of clusters to be inferred from \code{hc}.
#'
#' @keywords internal
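#' @examples
#' \dontrun{
#' ## minimal sketch with arbitrary data: relabel cutree() clusters so that they
#' ## run from left to right in the dendrogram
#' m <- matrix(rnorm(20), 10, 2)
#' rownames(m) <- paste0("cell", 1:10)
#' hc <- hclust(dist(m))
#' cutree(hc, 3)[hc$order]      # labels follow the tree but need not be 1, 2, 3
#' OrderClust(hc, 3)[hc$order]  # labels now appear in ascending order
#' }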
OrderClust <- function(hc, k) {
clusts <- cutree(hc, k)
labels <- names(clusts)
names(clusts) <- 1:length(clusts)
ordering <- clusts[hc$order]
new.index <- NULL
j <- 1
for (i in unique(ordering)) {
tmp <- rep(j, length(ordering[ordering == i]))
names(tmp) <- names(ordering[ordering == i])
new.index <- c(new.index, tmp)
j <- j + 1
}
clusts <- new.index
clusts <- clusts[order(as.numeric(names(clusts)))]
names(clusts) <- labels
return(clusts)
}
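#' Summarise the cells in each overlap sub-cluster
#'
#' Build, for every element of \code{overlap.clu}, a text note listing which cells
#' belong to each overlap sub-cluster.
#'
#' @param overlap.clu A list of overlap clustering results, each a data.frame whose
#' first column \code{cell} holds the cell names, followed by 0/1 membership columns.
#' @keywords internal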
OverlapNote <- function(overlap.clu){
K <- length(overlap.clu)
overlap.note <- list()
for(j in 1:K){
overlap <- overlap.clu[[j]]
k.sub <- dim(overlap)[2] - 1
names <- colnames(overlap)
cell.sum <- apply(overlap[,-1], 2, sum)
sub.cell <- sapply(1:k.sub, function(x){
cell.vec <- overlap$cell[overlap[, 1 + x] == 1]
return(paste(cell.vec, collapse = ", "))
})
new <- paste("There are ",cell.sum, " cells in ",
names[-1], ": ", sub.cell, sep = "")
sub.cell.res <- character()
for(i in 1:k.sub){
sub.cell.res <- paste(new[i], "\n\n\n", sub.cell.res, sep = "")
}
overlap.note[[j]] <- sub.cell.res
}
return(overlap.note)
}