R/tuneCluster.spls.R

Defines functions tuneCluster.spls

Documented in tuneCluster.spls

#' Feature Selection Optimization for sPLS method 
#' 
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::spls} by optimizing the silhouette coefficient, which assesses the quality of clustering.
#' 
#' @param X numeric matrix (or data.frame) with features in columns and samples in rows
#' @param Y numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X})
#' @param ncomp integer, number of component to include in the model
#' @param test.keepX vector of integer containing the different value of keepX to test for block \code{X}.
#' @param test.keepY vector of integer containing the different value of keepY to test for block \code{Y}.
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::spls})
#' 
#' @return 
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
#' \item{ncomp}{number of component included in the model}
#' \item{test.keepX}{list of tested keepX}
#' \item{test.keepY}{list of tested keepY}
#' \item{block}{names of blocks}
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
#' \item{choice.keepX}{best \code{keepX} for each component}
#' \item{choice.keepY}{best \code{keepY} for each component}
#' 
#'
#' @details
#' For each component and for each keepX/keepY value, a spls is done from these parameters.
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
#'
#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity.
#' A z-score is assigned to each slope.
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
#'
#' 
#' @seealso 
#' \code{\link[mixOmics]{spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}}
#'
#' @examples
#' demo <- suppressWarnings(get_demo_cluster())
#' X <- demo$X
#' Y <- demo$Y
#' 
#' # tuning
#' tune.spls <- tuneCluster.spls(X, Y, ncomp= 2, test.keepX= c(5,10,15,20), test.keepY= c(2,4,6))
#' keepX <- tune.spls$choice.keepX
#' keepY <- tune.spls$choice.keepY
#' 
#' # final model
#' spls.res <- mixOmics::spls(X, Y, ncomp= 2, keepX= keepX, keepY= keepY)
#' 
#' # get clusters and plot longitudinal profile by cluster
#' spls.cluster <- getCluster(spls.res)
#' plotLong(spls.res)
#' 
#'
#' @export
#' @import mixOmics
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr filter

tuneCluster.spls <- function(X, Y, ncomp = 2, test.keepX = rep(ncol(X), ncomp),
                             test.keepY = rep(ncol(Y), ncomp), ...){
    #-- checking input parameters ------------  ---------------------------------#
    #--------------------------------------------------------------------------#

    #-- X
    X <- validate_matrix_X(X)
    Y <- validate_matrix_Y(Y)

    #-- ncomp
    ncomp <- validate_ncomp(ncomp, list(X,Y))

    #-- test.keepX
    test.keepX <- validate_test_keepX(test.keepX = test.keepX, X = X)

    #-- test.keepY
    test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y)

    list.keepX.keepY <- list("keepX" = test.keepX, "keepY" = test.keepY) %>%
        expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE)


    #-- launch tuning  --------------------------------------------------------#
    #--------------------------------------------------------------------------#

    #-- 0. set output object
    result <- as.data.frame(matrix(ncol = 5, nrow = nrow(list.keepX.keepY)*ncomp))
    colnames(result) <- c("comp", "X", "Y", "pos", "neg")
    result.index <- 0

    #--1. compute dissimilarity matrix for silhouette coef. (once and for all)
    all_data <- cbind(X, Y)
    dmatrix <- dmatrix.spearman.dissimilarity(all_data)

    cluster <- as.data.frame(list("feature" = rownames(dmatrix),
                                  "block" = c(rep("X", ncol(X)), rep("Y", ncol(Y)))))

    #--2. tuning
    for(comp in 1:ncomp){
        for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){
            # foreach comp, keepX and keepY of other comp is set to minimum
            kX <- rep(min(test.keepX), ncomp)
            kY <- rep(min(test.keepY), ncomp)
            kX[comp] <- list.keepX.keepY[index.list.kX.kY,"keepX"]
            kY[comp] <- list.keepX.keepY[index.list.kX.kY,"keepY"]

            #--3. run spls
            spls.res <- mixOmics::spls(X = X, Y = Y, ncomp =  ncomp, keepX = kX, keepY = kY, ...)

            #--4. extract clusters
            tmp.cluster <- getCluster(spls.res)
            tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
                                                             by = c("feature"="molecule"))) %>%
                dplyr::mutate(cluster = as.numeric(as.character(cluster))) %>%
                dplyr::mutate(cluster = ifelse(is.na(cluster), 0, cluster))

            #--5. compute silhouette
            sil <- silhouette(dmatrix, tmp.cluster$cluster)

            #--6. store
            result.index <- result.index + 1
            result[result.index, "comp"] <- comp
            result[result.index, "X"] <- kX[comp]
            result[result.index, "Y"] <- kY[comp]
            # result[result.index, "pos"] <- sil$average.cluster  %>%
            #     dplyr::filter(cluster == comp) %>% pull(silhouette.coef)
            # result[result.index, "neg"] <- sil$average.cluster  %>%
            #     dplyr::filter(cluster == -comp) %>% pull(silhouette.coef)
            pos.res <-  sil$average.cluster  %>% 
                dplyr::filter(cluster == comp) %>%
                dplyr::pull(silhouette.coef)
            result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
            neg.res <-  sil$average.cluster  %>%
                dplyr::filter(cluster == -comp) %>%
                dplyr::pull(silhouette.coef)
            result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
        }
    }
    result <- list("silhouette" = result)
    result[["ncomp"]] <- ncomp
    result[["test.keepX"]] <- test.keepX
    result[["test.keepY"]] <- test.keepY
    result[["block"]] <- c("X", "Y")
    class(result) <- "spls.tune.silhouette"
    
    #-- 7. choice.keepX / choice.keepY
    result[["slopes"]] <- tune.silhouette.get_slopes(result)
    tmp <- tune.silhouette.get_choice_keepX(result) # choice keepX/keepY
    result[["choice.keepX"]] <- unlist(lapply(tmp, function(x) x$X))
    result[["choice.keepY"]] <- unlist(lapply(tmp, function(x) x$Y))
    return(result)
}

Try the timeOmics package in your browser

Any scripts or data that you put into this service are public.

timeOmics documentation built on Nov. 8, 2020, 10:58 p.m.