R/2_buildSOM.R

Defines functions GetPercentages GetCounts GroupStats GetFeatures GetClusterCVs GetClusterPercentagesPositive GetClusterMFIs GetMetaclusters GetClusters SaveClustersToFCS Dist.MST Purity Initialize_PCA Initialize_KWSP UpdateDerivedValues BuildSOM

Documented in BuildSOM Dist.MST GetClusterCVs GetClusterMFIs GetClusterPercentagesPositive GetClusters GetCounts GetFeatures GetMetaclusters GetPercentages GroupStats Initialize_KWSP Initialize_PCA Purity SaveClustersToFCS

#' Build a self-organizing map
#' 
#' Build a SOM based on the data contained in the FlowSOM object
#'
#' @param fsom      FlowSOM object containing the data, as constructed by 
#'                  the \code{\link{ReadInput}} function
#' @param colsToUse Markers, channels or indices to use for building the SOM
#' @param silent    if \code{TRUE}, no progress updates will be printed
#' @param outlierMAD Number of MAD when a cell is considered an outlier.
#'                   See also \code{\link{TestOutliers}}
#' @param ...       options to pass on to the SOM function (xdim, ydim, rlen, 
#'                  mst, alpha, radius, init, distf, importance)
#'
#' @return FlowSOM object containing the SOM result, which can be used as input
#'         for the \code{\link{BuildMST}} function
#'         
#' @seealso \code{\link{ReadInput}}, \code{\link{BuildMST}}
#' 
#' @references This code is strongly based on the \code{kohonen} package.
#'             R. Wehrens and L.M.C. Buydens, Self- and Super-organising Maps 
#'             in R: the kohonen package J. Stat. Softw., 21(5), 2007
#' 
#' @examples
#' 
#' # Read from file
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- ReadInput(fileName, compensate = TRUE, transform = TRUE,
#'                          scale = TRUE)
#' 
#' # Build the Self-Organizing Map
#' # E.g. with gridsize 5x5, presenting the dataset 20 times, 
#' # no use of MST in neighborhood calculations in between
#' flowSOM.res <- BuildSOM(flowSOM.res, colsToUse = c(9, 12, 14:18),
#'                         xdim = 5, ydim = 5, rlen = 20)
#' 
#' # Build the minimal spanning tree and apply metaclustering
#' flowSOM.res <- BuildMST(flowSOM.res)
#' metacl <- MetaClustering(flowSOM.res$map$codes,
#'                          "metaClustering_consensus", max = 10)
#' 
#' @export
BuildSOM <- function(fsom, 
                     colsToUse = NULL, 
                     silent = FALSE, 
                     outlierMAD = 4,
                     ...){
  if(!"data" %in% names(fsom)){
    stop("Please run the ReadInput function first!")
  }
  
  if(!silent) message("Building SOM\n")
  
  if(is.null(colsToUse)){
    colsToUse <- seq_len(ncol(fsom$data))
  }
  
  colsToUse <- GetChannels(fsom, colsToUse)
  
  fsom$map <- SOM(fsom$data[, colsToUse], silent = silent, ...)
  fsom$map$colsUsed <- colsToUse
  fsom <- UpdateDerivedValues(fsom)
  
  fsom$outliers <- TestOutliers(fsom,
                                madAllowed = outlierMAD)
  return(fsom)
}

UpdateDerivedValues <- function(fsom){
  fsom$map$medianValues <-
    t(sapply(seq_len(fsom$map$nNodes), function(i) {
      apply(subset(fsom$data, fsom$map$mapping[, 1] == i), 
            2, 
            stats::median)
    }))
  fsom$map$medianValues[is.nan(fsom$map$medianValues)] <- NA 
  colnames(fsom$map$medianValues) <- colnames(fsom$data)
  
  fsom$map$cvValues <-
    t(sapply(seq_len(fsom$map$nNodes), function(i) {
      clusterSubset <- subset(fsom$data, fsom$map$mapping[, 1] == i)
      sapply(colnames(clusterSubset), function(colY){
        y <- clusterSubset[, colY]
              if (any(is.na(y))) {
                stop(paste0("NA value found in cluster ", i, " channel ", 
                            colY, "."))}
              if(length(y) > 0 && mean(y) != 0){
                stats::sd(y)/mean(y)
              } else {
                NA
              }})
    }))
  fsom$map$cvValues[is.nan(fsom$map$cvValues)] <- NA 
  colnames(fsom$map$medianValues) <- colnames(fsom$data)
  
  fsom$map$sdValues <-
    t(sapply(seq_len(fsom$map$nNodes), function(i) {
      apply(subset(fsom$data, fsom$map$mapping[, 1] == i), 2, stats::sd)
    }))
  fsom$map$sdValues[is.nan(fsom$map$sdValues)] <- 0 
  colnames(fsom$map$sdValues) <- colnames(fsom$data)
  
  fsom$map$madValues <-
    t(sapply(seq_len(fsom$map$nNodes), function(i) {
      apply(subset(fsom$data, fsom$map$mapping[, 1] == i), 2, stats::mad)
    }))
  fsom$map$madValues[is.nan(fsom$map$madValues)] <- 0 
  colnames(fsom$map$madValues) <- colnames(fsom$data)
  
  pctgs <- rep(0, fsom$map$nNodes)
  names(pctgs) <- as.character(seq_len(fsom$map$nNodes))
  pctgs_tmp <- table(fsom$map$mapping[, 1]) / nrow(fsom$map$mapping)
  pctgs[names(pctgs_tmp)] <- pctgs_tmp
  fsom$map$pctgs <- pctgs
  
  
  if(!is.null(fsom$metaclustering)){
    fsom$map$metaclusterMFIs <- 
      data.frame(fsom$data, 
                 mcl = fsom$metaclustering[fsom$map$mapping[, 1]],
                 check.names = FALSE) %>% 
      dplyr::group_by(.data$mcl, .drop = FALSE) %>% 
      dplyr::summarise_all(stats::median) %>% 
      dplyr::select(-.data$mcl) %>% 
      data.frame(row.names = levels(fsom$metaclustering),
                 check.names = FALSE)
  }
  
  return(fsom)
}

#' Build a self-organizing map
#'
#' @param data  Matrix containing the training data
#' @param xdim  Width of the grid
#' @param ydim  Hight of the grid
#' @param rlen  Number of times to loop over the training data for each MST
#' @param mst   Number of times to build an MST
#' @param alpha Start and end learning rate
#' @param radius Start and end radius
#' @param init  Initialize cluster centers in a non-random way
#' @param initf Use the given initialization function if init == T
#'              (default: Initialize_KWSP)
#' @param distf Distance function (1 = manhattan, 2 = euclidean, 3 = chebyshev, 
#'              4 = cosine)
#' @param silent If FALSE, print status updates
#' @param map If FALSE, data is not mapped to the SOM. Default TRUE.
#' @param codes Cluster centers to start with
#' @param importance array with numeric values. Parameters will be scaled 
#'                   according to importance
#'
#' @return A list containing all parameter settings and results
#'         
#' @seealso \code{\link{BuildSOM}}
#' 
#' @references This code is strongly based on the \code{kohonen} package.
#'             R. Wehrens and L.M.C. Buydens, Self- and Super-organising Maps 
#'             in R: the kohonen package J. Stat. Softw., 21(5), 2007
#' @useDynLib FlowSOM, .registration = TRUE
#' @export

SOM <- function (data, xdim = 10, ydim = 10, rlen = 10, mst = 1, 
                 alpha = c(0.05, 0.01),
                 radius = stats::quantile(nhbrdist, 0.67) * c(1, 0), 
                 init = FALSE, initf = Initialize_KWSP, distf = 2, 
                 silent = FALSE, map = TRUE,
                 codes = NULL, importance = NULL){
  if (!is.null(codes)){
    if((ncol(codes) != ncol(data)) | (nrow(codes) != xdim * ydim)){
      stop("If codes is not NULL, it should have the same number of 
             columns as the data and the number of rows should correspond with 
             xdim*ydim")
    }
  }
  
  if(!is.null(importance)){
    data <- data * rep(importance, each = nrow(data))
  }
  
  if (is.null(colnames(data))) {
    colnames(data) <- as.character(seq_len(ncol(data)))
  }
  # Initialize the grid
  grid <- expand.grid(seq_len(xdim), seq_len(ydim))
  nCodes <- nrow(grid)
  if(is.null(codes)){
    if(init){
      codes <- initf(data, xdim, ydim)
      message("Initialization ready\n")
    } else {
      codes <- data[sample(1:nrow(data), nCodes, replace = FALSE), , 
                    drop = FALSE]
    }
  }
  
  # Initialize the neighborhood
  nhbrdist <- as.matrix(stats::dist(grid, method = "maximum"))
  
  # Initialize the radius
  if(mst == 1){
    radius <- list(radius)
    alpha <- list(alpha)
  } else {
    radius <- seq(radius[1], radius[2], length.out = mst+1)
    radius <- lapply(1:mst, function(i){c(radius[i], radius[i+1])})
    alpha <- seq(alpha[1], alpha[2], length.out = mst+1)
    alpha <- lapply(1:mst, function(i){c(alpha[i], alpha[i+1])})
  }
  
  # Compute the SOM
  for(i in seq_len(mst)){
    res <- .C("C_SOM", data = as.double(data), 
              codes = as.double(codes), 
              nhbrdist = as.double(nhbrdist), 
              alpha = as.double(alpha[[i]]), 
              radius = as.double(radius[[i]]), 
              xdists = double(nCodes), 
              n = as.integer(nrow(data)), 
              px = as.integer(ncol(data)), 
              ncodes = as.integer(nCodes), 
              rlen = as.integer(rlen), 
              distf = as.integer(distf))
    
    codes <- matrix(res$codes, nrow(codes), ncol(codes))
    colnames(codes) <- colnames(data)
    nhbrdist <- Dist.MST(codes)
  }
  
  if(!silent) message("Mapping data to SOM\n")

  if (map) {
    mapping <- MapDataToCodes(codes, data)
  } else {
    mapping <- NULL
  }
  
  return(list(xdim = xdim, ydim = ydim, rlen = rlen, mst = mst, alpha = alpha,
              radius = radius, init = init, distf = distf,
              grid = grid, codes = codes, mapping = mapping, nNodes = nCodes))
}



#' Assign nearest node to each datapoint
#
#' @param codes matrix with nodes of the SOM
#' @param newdata datapoints to assign
#' @param distf Distance function (1 = manhattan, 2 = euclidean, 3 = chebyshev, 
#'              4 = cosine)
#' 
#' @return Array with nearest node id for each datapoint
#' 
MapDataToCodes <- function (codes, newdata, distf = 2) {
  
  nnCodes <- .C("C_mapDataToCodes", 
                as.double(newdata[, colnames(codes)]), 
                as.double(codes),
                as.integer(nrow(codes)),
                as.integer(nrow(newdata)),
                as.integer(ncol(codes)),
                nnCodes = integer(nrow(newdata)),
                nnDists = double(nrow(newdata)), 
                distf = as.integer(distf))
  return(cbind(nnCodes$nnCodes, nnCodes$nnDists))
}

#' Select k well spread points from X
#' @param   X matrix in which each row represents a point
#' @param   xdim x dimension of the grid
#' @param   ydim y dimension of the grid
#'
#' @return  array containing the selected selected rows
#' 
#' @examples 
#' 
#' points <- matrix(1:1000, ncol = 10)
#' selection <- Initialize_KWSP(points, 3, 3)
#' 
#' @export
Initialize_KWSP <- function(X, xdim, ydim){
  k <- xdim * ydim
  
  # Start with a random point
  selected <- numeric(k)
  selected[1] <- sample(1:nrow(X), 1)
  dists <- apply(X, 1, function(x)sum((x - X[selected[1], ]) ^ 2))
  
  for(i in seq_len(k - 1) + 1){ #2:k
    # Add point wich is furthest away from all previously selected points
    selected[i] <- which.max(dists)
    # Update distances
    dists <- pmin(dists, 
                  apply(X, 1, function(x)sum((x - X[selected[i], ]) ^ 2)))
  }
  
  return(X[selected, ])
}

#' Create a grid from first 2 PCA components
#' @param   data matrix in which each row represents a point
#' @param   xdim x dimension of the grid
#' @param   ydim y dimension of the grid
#'
#' @return  array containing the selected selected rows
#' 
#' @examples 
#' 
#' points <- matrix(1:1000, ncol = 10)
#' selection <- Initialize_PCA(points, 3, 3)
#' 
#' @export
Initialize_PCA <- function(data, xdim, ydim){
  pca <- stats::prcomp(data, rank. = 2, retx = FALSE)
  # scale out to 5-times standard deviation, 
  # which should cover the data nicely
  sdev_scale <- 5 
  ax1 <- t(matrix(pca$rotation[, 1] * sdev_scale * pca$sdev,
                  nrow = ncol(data),
                  ncol = xdim * ydim)) *
    (2 * rep(c(1:xdim) - 1, times = ydim) / (xdim - 1) - 1)
  ax2 <- t(matrix(pca$rotation[, 2] * sdev_scale * pca$sdev,
                  nrow = ncol(data),
                  ncol = xdim * ydim)) *
    (2 * rep(c(1:ydim) - 1, each = xdim) / (ydim - 1) - 1)
  
  return(t(matrix(pca$center, 
                  nrow = ncol(data), 
                  ncol = xdim * ydim)) + 
           ax1 + ax2)
}

#' Calculate mean weighted cluster purity
#'
#' @param realClusters      array with real cluster values
#' @param predictedClusters array with predicted cluster values
#' @param weighted          logical. Should the mean be weighted
#'                          depending on the number of points in the predicted 
#'                          clusters
#'                           
#' @return Mean purity score, worst score, number of clusters with score < 0.75
#' @examples
#' # Generate some random data as an example
#' realClusters <- sample(1:5, 100, replace = TRUE)
#' predictedClusters <- sample(1:6, 100, replace = TRUE)
#' 
#' # Calculate the FMeasure
#' Purity(realClusters, predictedClusters)
#' @export
Purity <- function(realClusters, predictedClusters, weighted = TRUE){
  
  t <- table(predictedClusters, realClusters)
  maxPercentages <- apply(t/rowSums(t), 1, max)
  
  if(weighted)
    weightedPercentages <- maxPercentages * rowSums(t)/sum(t)
  else 
    weightedPercentages <- maxPercentages/nrow(t)
  
  return(c(sum(weightedPercentages), min(maxPercentages), 
           sum(maxPercentages<0.75)))
}

#' Calculate distance matrix using a minimal spanning tree neighborhood
#'
#' @param  X matrix in which each row represents a point
#'
#' @return Distance matrix
Dist.MST <- function(X){
  adjacency <- stats::dist(X, method = "euclidean")
  fullGraph <- igraph::graph.adjacency(as.matrix(adjacency), 
                                       mode = "undirected", 
                                       weighted = TRUE)
  mst <- igraph::minimum.spanning.tree(fullGraph)
  return(igraph::shortest.paths(mst, v = igraph::V(mst), to = igraph::V(mst), 
                                weights = NA))
}

#' Write FlowSOM clustering results to the original FCS files
#'
#' @param  fsom              FlowSOM object as generated by BuildSOM
#' @param  originalFiles     FCS files that should be extended
#' @param  preprocessedFiles FCS files that correspond to the input of FlowSOM,
#'                           If NULL (default), the originalFiles are used.
#' @param  selectionColumn   Column of the FCS file indicating the original cell
#'                           ids. If NULL (default), no selection is made.
#' @param  silent            If FALSE (default), print some extra output
#' @param  outputDir         Directory to save the FCS files. Default to the
#'                           current working directory (".")
#' @param  suffix            Suffix added to the filename. Default _FlowSOM.fcs
#' @param ...                Options to pass on to the read.FCS function (e.g.
#'                           truncate_max_range)
#'
#' @return Saves the extended FCS file as [originalName]_FlowSOM.fcs
#'
#' @examples
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' SaveClustersToFCS(flowSOM.res, fileName)
#' 
#' @importFrom stats runif
#' @export 
SaveClustersToFCS <- function(fsom, 
                              originalFiles, 
                              preprocessedFiles = NULL, 
                              selectionColumn = NULL, 
                              silent = FALSE,
                              outputDir = ".",
                              suffix = "_FlowSOM.fcs",
                              ...){
  if (length(preprocessedFiles) != 0 & length(preprocessedFiles)!= length(originalFiles)){ 
    stop("The vector of preprocessedFiles should be the same length as ",
         "the originalFiles when provided.")
  }
  
  for(i in seq_along(originalFiles)){
    if(!silent){message("Mapping ", originalFiles[i])}
    
    ff_o <- flowCore::read.FCS(originalFiles[i], ...)
    if(!is.null(preprocessedFiles)){
      ff <- flowCore::read.FCS(preprocessedFiles[i], ...)
    } else {
      ff <- ff_o
    }
    
    if(!is.null(selectionColumn)){
      s <- flowCore::exprs(ff)[, selectionColumn]
    } else {
      s <- seq_len(nrow(ff_o))    
    }
    
    # Map the data
    fsom_f <- NewData(fsom, ff)
    
    # Put on corresponding indices
    newCols <- ifelse(is.null(fsom$metaclustering), 3, 4)
    m <- matrix(0, nrow = nrow(ff_o), ncol = newCols)
    
    m[s, 1] <- GetClusters(fsom_f)
    
    spread <- min(dist(fsom_f$MST$l))/3
    m[s, 2] <- fsom_f$MST$l[,1][GetClusters(fsom_f)] + 
      stats::runif(length(s), min = -spread, max = spread)
    m[s, 3] <- fsom_f$MST$l[,2][GetClusters(fsom_f)] + 
      stats::runif(length(s), min = -spread, max = spread)
    if(!is.null(fsom_f$metaclustering)){
      m[s, 4] <- GetMetaclusters(fsom_f)
    }
    colnames(m) <- c("FlowSOM_cluster", "FlowSOM_x", "FlowSOM_y", "FlowSOM_meta")[seq_len(newCols)]
    
    # Save as FCS file
    ff_o <- flowCore::fr_append_cols(ff_o, m)
    outputFile <- file.path(outputDir,
                            gsub("\\.fcs$",
                                 suffix,
                                 basename(originalFiles[i])))
    flowCore::write.FCS(ff_o, 
                        filename = outputFile)  
    
    if(!silent){message("Result written to ", outputFile)}
  }
}

#' Get cluster label for all individual cells
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#'                          
#' @return vector label for every cell
#' @examples 
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' cluster_labels <- GetClusters(flowSOM.res)
#'
#' @export 
GetClusters <- function(fsom) {
  fsom <- UpdateFlowSOM(fsom)
  return(fsom$map$mapping[, 1])
}

#' Get metacluster label for all individual cells
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#' @param  meta             Metacluster label for each FlowSOM cluster. If this
#'                          is NULL, the fsom argument should be as generated by
#'                          the FlowSOM function, and fsom$metaclustering will
#'                          be used.                          
#' @return vector label for every cell
#' @examples 
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' metacluster_labels <- GetMetaclusters(flowSOM.res)
#' metacluster_labels <- GetMetaclusters(flowSOM.res,
#'                                       meta = flowSOM.res$metaclustering)
#'
#' @export 
GetMetaclusters <- function(fsom, meta = NULL){
  fsom <- UpdateFlowSOM(fsom)
  if(is.null(meta)) meta <- fsom$metaclustering
  return(meta[GetClusters(fsom)])
} 

#' Get MFI values for all clusters
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#' @param  colsUsed         logical. Should report only the columns used to 
#'                          build the SOM. Default = FALSE.
#' @param  prettyColnames   logical. Should report pretty column names instead
#'                          of standard column names. Default = FALSE.
#'                          
#' @return Matrix with median values for each marker
#'
#' @examples 
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' mfis <- GetClusterMFIs(flowSOM.res)
#' @export 
GetClusterMFIs <- function(fsom, colsUsed = FALSE, prettyColnames = FALSE){
  fsom <- UpdateFlowSOM(fsom)
  MFIs <- fsom$map$medianValues
  rownames(MFIs) <- seq_len(nrow(MFIs))
  if(is.null(fsom$map$colsUsed)) colsUsed <- FALSE
  if(is.null(fsom$prettyColnames)) prettyColnames <- FALSE
  if(colsUsed && !prettyColnames){
    MFIs <- MFIs[, fsom$map$colsUsed]
  } else if(!colsUsed && prettyColnames) {
    colnames(MFIs) <- fsom$prettyColnames
  } else if(colsUsed && prettyColnames) {
    MFIs <- MFIs[, fsom$map$colsUsed]
    colnames(MFIs) <- fsom$prettyColnames[fsom$map$colsUsed]
  }
  return(MFIs)
}

#' Get percentage-positive values for all clusters
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#' @param  cutoffs          named numeric vector. Upper bounds of negative
#'                          population fluorescence-intensity values for each
#'                          marker / channel.
#' @param  colsUsed         logical. Should report only the columns used to 
#'                          build the SOM. Default = FALSE.
#' @param  prettyColnames   logical. Should report pretty column names instead
#'                          of standard column names. Default = FALSE.
#'                          
#' @return Matrix with percentages of cells that are positive in selected markers per each cluster
#'
#' @examples 
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' perc_pos <- GetClusterPercentagesPositive(flowSOM.res, cutoffs = c('CD4' = 5000))
#' @export 
GetClusterPercentagesPositive <- function(fsom, cutoffs, colsUsed = FALSE, prettyColnames = FALSE){
  fsom <- UpdateFlowSOM(fsom)
  cl_per_cell <- GetClusters(fsom)
  clusters <- seq_len(NClusters(fsom))
  
  if(is.null(fsom$map$colsUsed)) colsUsed <- FALSE
  if(is.null(fsom$prettyColnames)) prettyColnames <- FALSE
  
  channels <- GetChannels(fsom, names(cutoffs))
  if(colsUsed && !prettyColnames){
    channels <- intersect(channels, fsom$map$colsUsed)
  }
  
  perc_pos <- matrix(NA,
                     nrow = length(clusters),
                     ncol = length(cutoffs))
  rownames(perc_pos) <- clusters
  colnames(perc_pos) <- names(cutoffs)
  
  i <- 0
  for (cluster in clusters){
    i <- i + 1
    
    data_per_cluster <- fsom$data[cl_per_cell == cluster, channels, drop = FALSE]
    
    if (length(data_per_cluster) > 0){
      data_per_cluster <- split(data_per_cluster, col(data_per_cluster))
      names(data_per_cluster) <- channels
      npoints <- length(data_per_cluster[[1]])
      perc_pos_per_channel <- mapply(function(data_per_channel, cutoff) sum(data_per_channel > cutoff) / npoints,
                                     data_per_cluster,
                                     cutoffs)
      perc_pos[i, ] <- perc_pos_per_channel
    }
  }
  if (prettyColnames){
    colnames(perc_pos) <- fsom$prettyColnames[channels]
  }
  return(perc_pos)
}

#' Get CV values for all clusters
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#'                          
#' @return Matrix with coefficient of variation values for each marker
#' 
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' flowSOM.res <- FlowSOM(fileName, compensate = TRUE, transform = TRUE,
#'                       scale = TRUE, colsToUse = c(9, 12, 14:18), nClus = 10)
#' cvs <- GetClusterCVs(flowSOM.res)
#'
#' @export
GetClusterCVs <- function(fsom){
  fsom <- UpdateFlowSOM(fsom)
  return(fsom$map$cvValues)
}

#' GetFeatures
#' 
#' Map FCS files on an existing FlowSOM object
#'
#' @param  fsom             FlowSOM object as generated by the FlowSOM function
#'                          or the BuildSOM function
#' @param  files            Either a vector of FCS files or paths to FCS files  
#' @param  level            Level(s) of interest. Default is c("clusters",
#'                          "metaclusters"), but can also be only one of them. 
#'                          Can be abbreviated.
#' @param  type             Type of features to extract. Default is "counts", 
#'                          can be a vector of "counts", "percentages", "MFIs"
#'                          and/or "percentages_positive" or abbreviations.
#' @param  MFI              Vector with channels / markers for which the MFI 
#'                          values must be returned when "MFIs" is in \code{type}
#' @param  positive_cutoffs Named vector with fluorescence-intensity values
#'                          per channel / marker that are the upper bounds for
#'                          a negative population when "percentages_positive" is
#'                          in \code{type}
#' @param  filenames        An optional vector with filenames that will be used
#'                          as rownames in the count matrices. If NULL (default)
#'                          either the paths will be used or a numerical vector.
#' @param  silent           Logical. If \code{TRUE}, print progress messages.
#'                          Default = \code{FALSE}.
#' 
#' @return matrix with features per population - type combination
#'         
#' @examples 
#'  # Build FlowSom result
#'  fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#'  ff <- flowCore::read.FCS(fileName)
#'  ff <- flowCore::compensate(ff, flowCore::keyword(ff)[["SPILL"]])
#'  ff <- flowCore::transform(ff,
#'          flowCore::transformList(colnames(flowCore::keyword(ff)[["SPILL"]]),
#'                                 flowCore::logicleTransform()))
#'  flowSOM.res <- FlowSOM(ff[1:1000, ], 
#'                         scale = TRUE, 
#'                         colsToUse = c(9, 12, 14:18),
#'                         nClus = 10)
#'    
#'  # Map new data
#'  counts <- GetFeatures(fsom = flowSOM.res, 
#'                        level = "clusters",
#'                        files = c(ff[1001:2000, ], ff[2001:3000, ]))
#'  features <- GetFeatures(fsom = flowSOM.res, 
#'                          files = c(ff[1001:2000, ], ff[2001:3000, ]),
#'                          type = c("counts", "percentages", "MFIs"), 
#'                          MFI = "APC-A", 
#'                          filenames = c("ff_1001-2000", "ff_2001-3000"))
#'
#'  # Get percentages of positive cells
#'  positive_cutoffs <- c('CD8' = 1.5,
#'                        'CD4' = 0.3,
#'                        'CD19' = 1.3,
#'                        'CD3' = -0.3)
#'  
#'  perc_pos <- GetFeatures(fsom = flowSOM.res, 
#'                          files = c(ff[1001:2000, ], ff[2001:3000, ]),
#'                          type = c("percentages_positive"), 
#'                          positive_cutoffs = positive_cutoffs,
#'                          filenames = c("ff_1001-2000", "ff_2001-3000"))
#'                          
#' @export 
GetFeatures <- function(fsom, 
                        files, 
                        level = c("clusters", "metaclusters"),
                        type = "counts", 
                        MFI = NULL, 
                        positive_cutoffs = NULL,
                        filenames = NULL, 
                        silent = FALSE) {
  nclus <- NClusters(fsom)
  nfiles <- length(files)
  i <- 0
  level <- pmatch(level, c("clusters", "metaclusters"))
  type <- pmatch(type, c("counts", "percentages", "MFIs", "percentages_positive"))
  
  #----Warnings----
  if (!is.null(filenames) & length(filenames) != nfiles){
    stop("\"filenames\" vector should have same length as \"files\" vector.")
  }
  
  if (any(is.na(level)) | length(level) > 2){
    stop("\"level\" should be \"clusters\" and/or \"metaclusters\".")
  } else {level <- c("clusters", "metaclusters")[level]}
  
  if (any(is.na(type)) | length(type) > 4){
    stop("\"type\" should be \"counts\", \"percentages\", \"MFIs\" and/or \"percentages_positive\".")
  } else {type <- c("counts", "percentages", "MFIs", "percentages_positive")[type]}
  
  if ("MFIs" %in% type & is.null(MFI)){
    stop("Please provide channel names for MFI calculation")
  }
  
  if ("percentages_positive" %in% type & is.null(positive_cutoffs)) {
    stop("Please provide positive cutoffs for percentage-positive calculation")
  }
  
  if (!is.null(positive_cutoffs) & length(names(positive_cutoffs)) == 0){
    stop("\"positive_cutoffs\" must be a named vector")
  }
  
  matrices <- list()
  
  #----Prepare matrices----
  if (is.null(filenames)) {
    if (is.character(files)) {
      filenames <- files
    } else {
      filenames <- as.character(seq_len(length(files)))
    }
  }
  C_counts <- matrix(data = 0,
                     nrow = nfiles,
                     ncol = nclus,
                     dimnames = list(filenames, 
                                     paste0("C", seq_len(nclus))))
  C_outliers <- matrix(data = 0,
                       nrow = nfiles,
                       ncol = nclus,
                       dimnames = list(filenames, 
                                       paste0("C", seq_len(nclus))))
  
  if ("MFIs" %in% type) {
    nmetaclus <- NMetaclusters(fsom)
    MFI <- GetChannels(fsom, MFI)
    nmarker <- length(MFI)
    C_MFIs <- matrix(NA,
                     nrow = nfiles,
                     ncol = nmarker * nclus,
                     dimnames = list(filenames,
                                     paste0(rep(paste0("C", seq_len(nclus)), 
                                                each = nmarker), 
                                            " ", fsom$prettyColnames[MFI])))
    MC_MFIs <- matrix(NA,
                      nrow = nfiles,
                      ncol = nmarker * nmetaclus,
                      dimnames = list(filenames,
                                      paste0(rep(paste0("MC", 
                                                        levels(fsom$metaclustering)), 
                                                 each = nmarker), 
                                             " ", fsom$prettyColnames[MFI])))
  }
  
  if ("percentages_positive" %in% type) {
    nmetaclus <- NMetaclusters(fsom)
    positive_cutoffs <- unlist(positive_cutoffs)
    perc_pos <- GetChannels(fsom, names(positive_cutoffs))
    nmarker <- length(perc_pos)
    C_perc_pos <- matrix(NA,
                         nrow = nfiles,
                         ncol = nmarker * nclus,
                         dimnames = list(filenames,
                                         paste0(rep(paste0("C", seq_len(nclus)), 
                                                    each = nmarker), 
                                                " ", fsom$prettyColnames[perc_pos])))
    MC_perc_pos <- matrix(NA,
                          nrow = nfiles,
                          ncol = nmarker * nmetaclus,
                          dimnames = list(filenames,
                                          paste0(rep(paste0("MC", 
                                                            levels(fsom$metaclustering)), 
                                                     each = nmarker), 
                                                 " ", fsom$prettyColnames[perc_pos])))
  }
  
  #----Loop over files----
  for (file in files){
    i <- i + 1
    if (isFALSE(silent)){
      message(paste0("Mapping file ", i, " of ", nfiles, "."))
    }
    fsom_tmp <- NewData(fsom = fsom,
                        input = file,
                        silent = silent)
    
    counts_t <- table(GetClusters(fsom_tmp))
    C_counts[i, paste0("C", names(counts_t))] <- counts_t
    outliers_t <- fsom_tmp$outliers[, "Number_of_outliers", drop = FALSE]
    if (nrow(outliers_t) != 0) {
      C_outliers[i, paste0("C", rownames(outliers_t))] <- 
        outliers_t$Number_of_outliers
    }
    
    if ("MFIs" %in% type){
      if ("clusters" %in% level){
        C_MFIs[i, ] <- as.vector(t(GetClusterMFIs(fsom_tmp)[, MFI]))
      }
      if ("metaclusters" %in% level){
        MC_MFIs[i, ] <- as.vector(t(GetMetaclusterMFIs(fsom_tmp)[, MFI]))
      }
    }
    
    if ("percentages_positive" %in% type){
      if ("clusters" %in% level){
        C_perc_pos[i, ] <- as.vector(t(GetClusterPercentagesPositive(fsom_tmp, cutoffs = positive_cutoffs)))
      }
      if ("metaclusters" %in% level){
        MC_perc_pos[i, ] <- as.vector(t(GetMetaclusterPercentagesPositive(fsom_tmp, cutoffs = positive_cutoffs)))
      }
    }
  }
  
  #----Add matrices to list----
  
  matrices[["outliers"]] <- C_outliers
  
  if ("clusters" %in% level){
    if ("counts" %in% type){
      C_counts_tmp <- C_counts
      attr(C_counts_tmp, "outliers") <- C_outliers
      matrices[["cluster_counts"]] <- C_counts_tmp
    }
    if ("percentages" %in% type){
      C_pctgs <- prop.table(C_counts, margin = 1)
      colnames(C_pctgs) <- paste0("%", colnames(C_pctgs))
      attr(C_pctgs, "outliers") <- C_outliers
      matrices[["cluster_percentages"]] <- C_pctgs
    }
    if ("MFIs" %in% type){
      matrices[["cluster_MFIs"]] <- C_MFIs
    }
    if ("percentages_positive" %in% type){
      matrices[["cluster_percentages_positive"]] <- C_perc_pos
    }
  }
  
  if ("metaclusters" %in% level){
    MC_counts <- t(apply(C_counts,
                         1,
                         function(x){
                           tapply(x, fsom$metaclustering, sum)
                         }))
    MC_counts[is.na(MC_counts)] <- 0
    colnames(MC_counts) <- paste0("MC", colnames(MC_counts))
    
    if ("counts" %in% type){
      matrices[["metacluster_counts"]] <- MC_counts
    }
    if ("percentages" %in% type){
      MC_pctgs <- prop.table(MC_counts, margin = 1)
      colnames(MC_pctgs) <- paste0("%", colnames(MC_pctgs))
      matrices[["metacluster_percentages"]] <- MC_pctgs
    }
    if ("MFIs" %in% type){
      matrices[["metacluster_MFIs"]] <- MC_MFIs
    }
    if ("percentages_positive" %in% type){
      matrices[["metacluster_percentages_positive"]] <- MC_perc_pos
    }
  }
  
  return(matrices)
} 

#' GroupStats 
#' 
#' Calculate statistics between 2 groups based on the \code{\link{GetFeatures}}
#' output
#' 
#' @param features Feature matrix as generated by \code{\link{GetFeatures}}, 
#'                 e.g. a percentages matrix
#' @param groups   Named list with file or patient IDs per group (should match
#'                 with the rownames of the \code{matrix}).
#'                          
#' @return Matrix with the medians per group, the p-values (the raw, Benjamini 
#' Hochberg corrected one and the -log10) that resulted from a Wilcox test and 
#' the fold and log10 fold changes between the medians of the 2 groups
#' 
#' @examples 
#' # Build FlowSom result
#'  fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#'  ff <- flowCore::read.FCS(fileName)
#'  ff <- flowCore::compensate(ff, flowCore::keyword(ff)[["SPILL"]])
#'  ff <- flowCore::transform(ff,
#'          flowCore::transformList(colnames(flowCore::keyword(ff)[["SPILL"]]),
#'                                 flowCore::logicleTransform()))
#'  flowSOM.res <- FlowSOM(ff, scale = TRUE, colsToUse = c(9, 12, 14:18),
#'                           nClus = 10)
#'    
#' # Create new data
#' # To illustrate the output, we here generate new FCS files (with more 
#' # cells in metaclusters 1 and 9).
#' # In practice you would not generate any new file but use your different
#' # files from your different groups
#'  flowCore::write.FCS(ff[sample(1:nrow(ff), 1000), ], file = "ff_tmp1.fcs")
#'  flowCore::write.FCS(ff[sample(1:nrow(ff), 1000), ], file = "ff_tmp2.fcs")
#'  flowCore::write.FCS(ff[sample(1:nrow(ff), 1000), ], file = "ff_tmp3.fcs")
#'  ff_tmp <- ff[c(1:1000,
#'                 which(flowSOM.res$map$mapping[, 1] %in% 
#'                      which(flowSOM.res$metaclustering == 9)),
#'                 which(flowSOM.res$map$mapping[, 1] %in% 
#'                      which(flowSOM.res$metaclustering == 1))), ]
#'  flowCore::write.FCS(ff_tmp[sample(1:nrow(ff_tmp), 1000), ],
#'                      file = "ff_tmp4.fcs")
#'  flowCore::write.FCS(ff_tmp[sample(1:nrow(ff_tmp), 1000), ], 
#'                      file = "ff_tmp5.fcs")
#'  
#' # Get the count matrix
#'  percentages <- GetFeatures(fsom = flowSOM.res, 
#'                             files = c("ff_tmp1.fcs", 
#'                                       "ff_tmp2.fcs", 
#'                                       "ff_tmp3.fcs",
#'                                       "ff_tmp4.fcs", 
#'                                       "ff_tmp5.fcs"), 
#'                             type = "percentages")
#'                        
#'   
#' # Perform the statistics
#' groups <- list("Group 1" = c("ff_tmp1.fcs", "ff_tmp2.fcs", "ff_tmp3.fcs"), 
#'                "Group 2" = c("ff_tmp4.fcs", "ff_tmp5.fcs"))
#' MC_stats <- GroupStats(percentages[["metacluster_percentages"]], groups)
#' C_stats <- GroupStats(percentages[["cluster_percentages"]], groups)
#' 
#' # Process the fold changes vector
#' fold_changes <- C_stats["fold changes", ]
#' fold_changes <- factor(ifelse(fold_changes < -3, 
#'                               "Underrepresented compared to Group 1",
#'                               ifelse(fold_changes > 3, 
#'                                      "Overrepresented compared to Group 1",
#'                                       "--")), 
#'                         levels = c("--", 
#'                                    "Underrepresented compared to Group 1",
#'                                    "Overrepresented compared to Group 1"))
#' fold_changes[is.na(fold_changes)] <- "--"
#' 
#' # Show in figure
#' ## Fold change
#' gr_1 <- PlotStars(flowSOM.res, 
#'                   title = "Group 1", 
#'                   nodeSizes = C_stats["medians Group 1", ], 
#'                   list_insteadof_ggarrange = TRUE)
#' gr_2 <- PlotStars(flowSOM.res, title = "Group 2", 
#'             nodeSizes = C_stats["medians Group 2", ], 
#'             backgroundValues = fold_changes,
#'             backgroundColors = c("white", "red", "blue"), 
#'             list_insteadof_ggarrange = TRUE)
#' p <- ggpubr::ggarrange(plotlist = c(list(gr_1$tree), gr_2),
#'                        heights = c(3, 1))
#' ggplot2::ggsave("Groups_foldchanges.pdf", p, width = 10)
#' 
#' ## p values
#' p <- PlotVariable(flowSOM.res, title = "Wilcox test group 1 vs. group 2",
#' variable = C_stats["p values", ])
#' ggplot2::ggsave("Groups_pvalues.pdf", p)
#' 
#' ## volcano plot
#' p <- ggplot2::ggplot(data.frame("-log10 p values" = c(C_stats[4, ], 
#'                                                       MC_stats[4, ]), 
#'                                 "log10 fold changes" = c(C_stats[7, ],
#'                                                          MC_stats[7, ]), 
#' check.names = FALSE), ggplot2::aes(x = `log10 fold changes`, 
#'                                    y = `-log10 p values`)) +
#' ggplot2::xlim(-3, 3) +
#' ggplot2::ylim(0, 3) +
#' ggplot2::geom_point() 
#' 
#' @importFrom stats p.adjust wilcox.test
#' 
#' @export
GroupStats <- function(features, groups){
  nGroups <- lapply(groups, 
                    length)
  
  #----Warnings----
  if (length(groups) != 2 | !is.list(groups)){
    stop("Groups should be a named list with 2 groups")
  }
  
  if (!all(c(groups[[1]], groups[[2]]) %in% rownames(features))){
    stop("File or patient IDs from groups should correspond to ",
         "the features' rownames")
  }
  
  
  #----Calculate medians per group----
  i <- rep(NA, nrow(features))
  for (group in names(groups)){
    i[rownames(features) %in% groups[[group]]] <- group
  }
  
  medians <- apply(features, 2, function(x) {
    tapply(x, 
           INDEX = factor(i, 
                          levels = names(groups)), 
           stats::median, 
           na.rm = TRUE)
  })
  
  #----Calculate fold changes between groups----
  fold_change <- c()
  for (col in seq_len(ncol(medians))){
    m <- medians[, col]
    if (any(m <= 0) | any(is.na(m))){
      fold_change <- c(fold_change, NA)
    } else{
      fold_change <- c(fold_change, 
                       max(m, na.rm = TRUE) / 
                         min(m, na.rm = TRUE) * 
                         (-1) ^ which.max(m))
    }
  }
  
  logfold <- sign(fold_change)*log10(abs(fold_change))
  
  #----Perform Wilcoxon test between groups----
  pValues <- c()
  for (col in seq_len(ncol(features))){
    if(! (all(is.na(features[c(groups[[1]], groups[[2]]), col])) | 
          all(features[c(groups[[1]], groups[[2]]), col] == 0))){
      test <- stats::wilcox.test(features[groups[[1]], col], 
                                 features[groups[[2]], col],
                                 exact = FALSE)
      pValues <- c(pValues, test$p.value)
    } else {
      pValues <- c(pValues, 1)
    }
    
  }
  adjustedP <- stats::p.adjust(pValues, "BH")
  logP <- -log10(pValues)
  
  stats <- rbind(medians, pValues, logP, adjustedP, fold_change, logfold)
  rownames(stats) <- c(paste0("medians ", names(groups)), 
                       "p values",
                       "-log10 p values", 
                       "adjusted p values",
                       "fold changes", 
                       "log10 fold changes")
  return(stats)
}

#' GetCounts 
#' 
#' Get counts of number of cells in clusters or metaclusters
#' 
#' @param fsom        FlowSOM object
#' @param level       Character string, should be either "clusters" or 
#'                    "metaclusters" (default) or abbreviations.
#' 
#' @return A named vector with the counts
#' 
#' @examples 
#' # Read from file, build self-organizing map and minimal spanning tree
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' ff <- flowCore::read.FCS(fileName)
#' ff <- flowCore::compensate(ff, flowCore::keyword(ff)[["SPILL"]])
#' ff <- flowCore::transform(ff, flowCore::estimateLogicle(ff,
#'                                                flowCore::colnames(ff)[8:18]))
#' flowSOM.res <- FlowSOM(ff,
#'                        scale = TRUE,
#'                        colsToUse = c(9, 12, 14:18),
#'                        nClus = 10,
#'                        seed = 1)
#' GetCounts(flowSOM.res)                      
#' GetCounts(flowSOM.res, level = "clusters")
#' @export
GetCounts <- function(fsom, level = "metaclusters"){
  level <- pmatch(level, c("metaclusters", "clusters"))
  fsom <- UpdateFlowSOM(fsom)
  if (length(level) > 1 | any(is.na(level))){
    stop("level should be \"clusters\" or \"metaclusters\"")
  } else {
    level <- c("metaclusters", "clusters")[level]
    if (!is.null(fsom$metaclustering) && level == "metaclusters"){
      counts <- rep(0, NMetaclusters(fsom))
      names(counts) <- paste("MC", levels(fsom$metaclustering))
      tmp <- table(GetMetaclusters(fsom))
      counts[paste("MC", names(tmp))] <- tmp
    } else if (level == "clusters"){
      counts <- rep(0, NClusters(fsom))
      names(counts) <- paste("C", seq_len(NClusters(fsom)))
      tmp <- table(GetClusters(fsom))
      counts[paste("C", names(tmp))] <- tmp
    }
  }
  return(counts)
}

#' GetPercentages
#' 
#' Get percentages of number of cells in clusters or metaclusters
#' 
#' @param fsom        FlowSOM object
#' @param level       Character string, should be either "clusters" or 
#'                    "metaclusters" (default) or abbreviations.
#' 
#' @return A named vector with the percentages
#' 
#' @examples 
#' # Read from file, build self-organizing map and minimal spanning tree
#' fileName <- system.file("extdata", "68983.fcs", package = "FlowSOM")
#' ff <- flowCore::read.FCS(fileName)
#' ff <- flowCore::compensate(ff, flowCore::keyword(ff)[["SPILL"]])
#' ff <- flowCore::transform(ff, flowCore::estimateLogicle(ff,
#'                                                flowCore::colnames(ff)[8:18]))
#' flowSOM.res <- FlowSOM(ff,
#'                        scale = TRUE,
#'                        colsToUse = c(9, 12, 14:18),
#'                        nClus = 10,
#'                        seed = 1)
#' GetPercentages(flowSOM.res)                      
#' GetPercentages(flowSOM.res, level = "clusters")
#' @export
GetPercentages <- function(fsom, level = "metaclusters"){
  counts <- GetCounts(fsom, level = level)
  return(counts / sum(counts, na.rm = TRUE))
}
saeyslab/FlowSOM documentation built on July 6, 2024, 10:59 a.m.