R/CA.R

Defines functions elbow_method scree_plot ca_coords subset_dims run_cacomp var_rows rm_zeros comp_std_residuals

Documented in ca_coords comp_std_residuals elbow_method rm_zeros run_cacomp scree_plot subset_dims var_rows

#' @include constructor.R
NULL

#' Compute Standardized Residuals
#'
#' @description
#' `comp_std_residuals` computes the standardized residual matrix S,
#' which is the basis for correspondence analysis and serves
#' as input for singular value decomposition (SVD).
#'
#' @details
#' Calculates standardized residual matrix S from the proportion matrix P and
#' the expected values E according to \eqn{S = \frac{(P-E)}{sqrt(E)}}.
#'
#' @param mat A numerical matrix or coercible to one by `as.matrix()`.
#' Should have row and column names.
#' @return
#' A named list with standardized residual matrix "S",
#' grand total of the original matrix "tot"
#' as well as row and column masses "rowm" and "colm" respectively.
#'
comp_std_residuals <- function(mat) {

  if (!is(mat, "matrix") & !is(mat, "dgCMatrix")) {
    mat <- as.matrix(mat)
  }
  stopifnot(
    "Input matrix does not have any rownames!" = !is.null(rownames(mat)))
  stopifnot(
    "Input matrix does not have any colnames!" = !is.null(colnames(mat)))

  tot <- sum(mat)
  P <- mat/tot               # proportions matrix
  rowm <- Matrix::rowSums(P)          # row masses
  colm <- Matrix::colSums(P)          # column masses

  E <- rowm %o% colm      # expected proportions
  S <-  (P - E) / sqrt(E)         # standardized residuals
  S[is.nan(S)] <- 0

  out <- list("S"=S, "tot"=tot, "rowm"=rowm, "colm"=colm)
  return(out)
}

#' removes 0-only rows and columns in a matrix.
#'
#' @param obj A matrix.
#' @return Input matrix with rows & columns consisting of only 0 removed.
rm_zeros <- function(obj){
  stopifnot(is(obj, "matrix") | is(obj, "dgCMatrix"))

  no_zeros_rows <- Matrix::rowSums(obj) > 0
  no_zeros_cols <- Matrix::colSums(obj) > 0
  if (sum(!no_zeros_rows) != 0){
    ## Delete genes with only zero values across all columns
    warning("Matrix contains rows with only 0s. ",
            "These rows were removed. ",
            "If undesired set rm_zeros = FALSE.")
    obj <- obj[no_zeros_rows,]
  }
  if (sum(!no_zeros_cols) != 0){
    ## Delete cells with only zero values across all genes
    warning("Matrix contains columns with only 0s. ",
            "These columns were removed. ",
            "If undesired set rm_zeros = FALSE.")
    obj <- obj[,no_zeros_cols]
  }

  return(obj)
}
#' Find most variable rows
#'
#' @description
#' Calculates the variance of the chi-square component matrix and selects the 
#' rows with the highest variance, e.g. 5,000.
#'
#' @return
#' Returns a matrix, which consists of the top variable rows of mat.
#'
#' @param mat A numeric matrix. For sequencing a count matrix,
#' gene expression values with genes in rows and samples/cells in columns.
#' Should contain row and column names.
#' @param top Integer. Number of most variable rows to retain. Default 5000.
#' @export
#' @examples
#' set.seed(1234)
#'
#' # Simulate counts
#'cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'               x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#'
#' # Choose top 5000 most variable genes
#' cnts <- var_rows(mat = cnts, top = 5000)
#'
#'
var_rows <- function(mat, top = 5000){

  res <-  comp_std_residuals(mat=mat)

  if(top>nrow(mat)) {
    warning("Top is larger than the number of rows in matrix. ",
            "Top was set to nrow(mat).")
  }
  
  top <- min(nrow(mat), top)

  chisquare <- res$tot * (res$S^2)		# chi-square components matrix
  variances <- apply(chisquare,1,var) #row-wise variances
  ix_var <- order(-variances)
  mat <- mat[ix_var[seq_len(top)],] # choose top rows
  return(mat)

}


#' Internal function for `cacomp`
#'
#' @description
#' `run_cacomp` performs correspondence analysis on a matrix and returns the 
#' transformed data.
#'
#' @details
#' The calculation is performed according to the work of Michael Greenacre. 
#' Singular value decomposition
#' can be performed either with the base R function `svd` or preferably by the 
#' faster
#' pytorch implementation (python = TRUE). When working with large matrices, 
#' CA coordinates and
#' principal coordinates should only be computed when needed to save 
#' computational time.
#'
#' @return
#' Returns a named list of class "cacomp" with components
#' U, V and D: The results from the SVD.
#' row_masses and col_masses: Row and columns masses.
#' top_rows: How many of the most variable rows/genes were retained for the 
#' analysis.
#' tot_inertia, row_inertia and col_inertia: Only if inertia = TRUE. Total, 
#' row and column inertia respectively.
#' @references
#' Greenacre, M. Correspondence Analysis in Practice, Third Edition, 2017.

#' @param obj A numeric matrix or Seurat/SingleCellExperiment object. For 
#' sequencing a count matrix, gene expression values with genes in rows and 
#' samples/cells in columns.
#' Should contain row and column names.
#' @param coords Logical. Indicates whether CA standard coordinates should be 
#' calculated. Default TRUE
#' @param python A logical value indicating whether to use singular-value 
#' decomposition from the python package torch.
#' This implementation dramatically speeds up computation compared to `svd()` 
#' in R.
#' @param princ_coords Integer. Number indicating whether principal 
#' coordinates should be calculated for the rows (=1), columns (=2), 
#' both (=3) or none (=0).
#' Default 1.
#' @param dims Integer. Number of CA dimensions to retain. Default NULL 
#' (keeps all dimensions).
#' @param top Integer. Number of most variable rows to retain. Default 5000.
#' @param inertia Logical.. Whether total, row and column inertias should be 
#' calculated and returned. Default TRUE.
#' @param rm_zeros Logical. Whether rows & cols containing only 0s should be 
#' removed. Keeping zero only rows/cols might lead to unexpected results. 
#' Default TRUE.
#' @param ... Arguments forwarded to methods.
run_cacomp <- function(obj,
                   coords = TRUE,
                   princ_coords = 3,
                   python = FALSE,
                   dims = NULL,
                   top = 5000,
                   inertia = TRUE,
                   rm_zeros = TRUE,
                   ...){

  stopifnot("Input matrix does not have any rownames!" =
              !is.null(rownames(obj)))
  stopifnot("Input matrix does not have any colnames!" = 
              !is.null(colnames(obj)))

  if (rm_zeros == TRUE){
    obj <- rm_zeros(obj)
  }


  # Choose only top # of variable genes
  if (is.null(top) || top == nrow(obj)) {
    res <-  comp_std_residuals(mat=obj)
    toptmp <- nrow(obj)
  } else if (!is.null(top) && top < nrow(obj)){

    obj <- var_rows(mat = obj, top = top)
    res <-  comp_std_residuals(mat=obj)
    toptmp <- top
    
  } else if (top > nrow(obj)) {
    warning("\nParameter top is >nrow(obj) and therefore ignored.")
    res <-  comp_std_residuals(mat=obj)
    toptmp <- nrow(obj)
  } else {
    warning("\nUnusual input for top, argument ignored.")
    res <-  comp_std_residuals(mat=obj)
    toptmp <- nrow(obj)
  }

  S <- res$S
  tot <- res$tot
  rowm <- res$rowm
  colm <- res$colm
  rm(res)

  k <- min(dim(S))-1

  if (is.null(dims)) dims <- k
  if (dims > k) dims <- k
  # S <- (diag(1/sqrt(r)))%*%(P-r%*%t(c))%*%(diag(1/sqrt(c)))
  # message("Running singular value decomposition ...")

  if (python == TRUE) {

    # python implementation currently can only handle dense matrices of the base class.
    if (!is(S, "matrix")) {
      S <- as.matrix(S)
    }

    svd_torch <- NULL
    # require(reticulate)
    # source_python('./python_svd.py')
    reticulate::source_python(system.file("python/python_svd.py", package = "APL"))
    
    SVD <- svd_torch(S)
    # SVD <- svd_linalg_torch(S)
    names(SVD) <- c("U", "D", "V")
    SVD$D <- as.vector(SVD$D)

  } else {

    SVD <- svd(S, nu = dims, nv = dims)
    names(SVD) <- c("D", "U", "V")
    SVD <- SVD[c(2, 1, 3)]
    if(length(SVD$D) > dims) SVD$D <- SVD$D[seq_len(dims)]
  }

  names(SVD$D) <- paste0("Dim", seq_len(length(SVD$D)))
  dimnames(SVD$V) <- list(colnames(S), paste0("Dim", seq_len(ncol(SVD$V))))
  dimnames(SVD$U) <- list(rownames(S), paste0("Dim", seq_len(ncol(SVD$U))))


  if(inertia == TRUE){
    #calculate inertia
    SVD$tot_inertia <- sum(SVD$D^2)
    SVD$row_inertia <- Matrix::rowSums(S^2)
    SVD$col_inertia <- Matrix::colSums(S^2)
  }

  SVD$row_masses <- rowm
  SVD$col_masses <- colm
  SVD$top_rows <- toptmp

  SVD <- do.call(new_cacomp, SVD)
  SVD <- subset_dims(SVD, dims)
  # class(SVD) <- "cacomp"

  if (coords == TRUE){
    # message("Calculating coordinates...")

    SVD <- ca_coords(caobj = SVD,
                     dims = dims,
                     princ_coords = princ_coords,
                     princ_only = FALSE)
  } else {
    if(!is.null(dims)){
      if (dims >= length(SVD@D)){
        if (dims > length(SVD@D)){
          warning("Chosen number of dimensions is larger than the ",
                  "number of dimensions obtained from the singular ",
                  "value decomposition. Argument ignored.")
        }
        SVD@dims <- length(SVD@D)
      } else {
        dims <- min(dims, length(SVD@D))
        SVD@dims <- dims

        dims <- seq(dims)

        # subset to number of dimensions
        SVD@U <- SVD@U[,dims]
        SVD@V <- SVD@V[,dims]
        SVD@D <- SVD@D[dims]
      }
    } else {
      SVD@dims <- length(SVD@D)
    }
  }

  stopifnot(validObject(SVD))
  return(SVD)
}


#' Correspondance Analysis
#'
#' @description
#' `cacomp` performs correspondence analysis on a matrix or
#' Seurat/SingleCellExperiment object and returns the transformed data.
#'
#' @details
#' The calculation is performed according to the work of Michael Greenacre. 
#' Singular value decomposition can be performed either with the base R 
#' function `svd` or preferably by the faster pytorch implementation 
#' (python = TRUE). When working with large matrices, CA coordinates and
#' principal coordinates should only be computed when needed to save 
#' computational time.
#'
#' @return
#' Returns a named list of class "cacomp" with components
#' U, V and D: The results from the SVD.
#' row_masses and col_masses: Row and columns masses.
#' top_rows: How many of the most variable rows were retained for the analysis.
#' tot_inertia, row_inertia and col_inertia: Only if inertia = TRUE.
#' Total, row and column inertia respectively.
#' @references
#' Greenacre, M. Correspondence Analysis in Practice, Third Edition, 2017.

#' @param obj A numeric matrix or Seurat/SingleCellExperiment object.
#' For sequencing a count matrix, gene expression values with genes in rows 
#' and samples/cells in columns.
#' Should contain row and column names.
#' @param coords Logical. Indicates whether CA standard coordinates should be 
#' calculated. Default TRUE
#' @param python A logical value indicating whether to use singular-value 
#' decomposition from the python package torch.
#' This implementation dramatically speeds up computation compared to `svd()` 
#' in R.
#' @param princ_coords Integer. Number indicating whether principal 
#' coordinates should be calculated for the rows (=1), columns (=2), 
#' both (=3) or none (=0).
#' Default 1.
#' @param dims Integer. Number of CA dimensions to retain. Default NULL 
#' (keeps all dimensions).
#' @param top Integer. Number of most variable rows to retain. 
#' Default 5000. (set NULL to keep all).
#' @param inertia Logical.. Whether total, row and column inertias should be 
#' calculated and returned. Default TRUE.
#' @param rm_zeros Logical. Whether rows & cols containing only 0s should be 
#' removed.
#' Keeping zero only rows/cols might lead to unexpected results. Default TRUE.
#' @param ... Arguments forwarded to methods.
#' @examples
#' # Simulate scRNAseq data.
#' cnts <- data.frame(cell_1 = rpois(10, 5),
#'                    cell_2 = rpois(10, 10),
#'                    cell_3 = rpois(10, 20))
#' rownames(cnts) <- paste0("gene_", 1:10)
#' cnts <- as.matrix(cnts)
#'
#' # Run correspondence analysis.
#' ca <- cacomp(obj = cnts, princ_coords = 3, top = 5)
#' @export
setGeneric("cacomp", function(obj,
                              coords = TRUE,
                              princ_coords = 3,
                              python = FALSE,
                              dims = NULL,
                              top = 5000,
                              inertia = TRUE,
                              rm_zeros = TRUE,
                              ...) {
  standardGeneric("cacomp")
})


#' @rdname cacomp
#' @export
setMethod(f = "cacomp",
          signature=(obj="matrix"),
          function(obj,
                   coords = TRUE,
                   princ_coords = 3,
                   python = FALSE,
                   dims = NULL,
                   top = 5000,
                   inertia = TRUE,
                   rm_zeros = TRUE,
                   ...){

    caobj <- run_cacomp(obj = obj,
                        coords = coords,
                        princ_coords = princ_coords,
                        python = python,
                        dims = dims,
                        top = top,
                        inertia = inertia,
                        rm_zeros = rm_zeros,
                        ...)

    return(caobj)

})

#' @rdname cacomp
#' @export
setMethod(f = "cacomp",
          signature=(obj="dgCMatrix"),
          function(obj,
                   coords = TRUE,
                   princ_coords = 3,
                   python = FALSE,
                   dims = NULL,
                   top = 5000,
                   inertia = TRUE,
                   rm_zeros = TRUE,
                   ...){

    caobj <- run_cacomp(obj = obj,
                        coords = coords,
                        princ_coords = princ_coords,
                        python = python,
                        dims = dims,
                        top = top,
                        inertia = inertia,
                        rm_zeros = rm_zeros,
                        ...)

    return(caobj)

})


#' Correspondance Analysis for Seurat objects
#'
#' @description
#' `cacomp.seurat` performs correspondence analysis on an assay from a Seurat 
#' container and stores the standardized coordinates of the columns (= cells) 
#' and the principal coordinates of the rows (= genes) as a DimReduc Object in 
#' the Seurat container.
#'
#' @return
#' If return_imput = TRUE with Seurat container: Returns input obj of class 
#' "Seurat" with a new Dimensional Reduction Object named "CA".
#' Standard coordinates of the cells are saved as embeddings,
#' the principal coordinates of the genes as loadings and
#' the singular values (= square root of principal intertias/eigenvalues)
#' are stored as stdev.
#' To recompute a regular "cacomp" object without rerunning cacomp use 
#' `as.cacomp()`.
#' @param assay Character. The assay from which extract the count matrix for 
#' SVD, e.g. "RNA" for Seurat objects or "counts"/"logcounts" for 
#' SingleCellExperiments.
#' @param slot character. The slot of the Seurat assay. Default "counts".
#' @param return_input Logical. If TRUE returns the input 
#' (SingleCellExperiment/Seurat object) with the CA results saved in the 
#' reducedDim/DimReduc slot "CA".
#' Otherwise returns a "cacomp". Default FALSE.
#' @param ... Other parameters
#' @rdname cacomp
#' @export
#' @examples
#' 
#' ###########
#' # Seurat  #
#' ###########
#' library(Seurat)
#' set.seed(1234)
#'
#' # Simulate counts
#' cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'                      x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#'
#' # Create Seurat object
#' seu <- CreateSeuratObject(counts = cnts)
#'
#' # Run CA and save in dim. reduction slot
#' seu <- cacomp(seu, return_input = TRUE, assay = "RNA", slot = "counts")
#'
#' # Run CA and return cacomp object
#' ca <- cacomp(seu, return_input = FALSE, assay = "RNA", slot = "counts")
setMethod(f = "cacomp",
          signature=(obj="Seurat"),
          function(obj,
                   coords = TRUE,
                   princ_coords = 3,
                   python = FALSE,
                   dims = NULL,
                   top = 5000,
                   inertia = TRUE,
                   rm_zeros = TRUE,
                   ...,
                   assay = Seurat::DefaultAssay(obj),
                   slot = "counts",
                   return_input = FALSE){

  stopifnot("obj doesnt belong to class 'Seurat'" = is(obj, "Seurat"))

  stopifnot("Set coords = TRUE when inputting a Seurat object and return_input = TRUE." = coords == TRUE)


  seu <- Seurat::GetAssayData(object = obj, assay = assay, slot = slot)
  seu <- as.matrix(seu)

  caobj <- run_cacomp(obj = seu,
                      coords = coords,
                      top = top,
                      princ_coords = princ_coords,
                      dims = dims,
                      python = python,
                      rm_zeros = rm_zeros,
                      inertia = inertia,
                      ...)

  if (return_input == TRUE){
    colnames(caobj@V) <- paste0("DIM_", seq(ncol(caobj@V)))
    colnames(caobj@U) <- paste0("DIM_", seq(ncol(caobj@U)))

    obj[["CA"]] <- Seurat::CreateDimReducObject(embeddings = caobj@std_coords_cols,
                                               loadings = caobj@prin_coords_rows,
                                               stdev = caobj@D,
                                               key = "Dim_",
                                               assay = assay)

    return(obj)
  } else {
    return(caobj)
  }

})


#' Correspondance Analysis for SingleCellExperiment objects
#'
#' @description
#' `cacomp.SingleCellExperiment` performs correspondence analysis on an assay 
#' from a SingleCellExperiment and stores the standardized coordinates
#'  of the columns (= cells) and the principal coordinates of the rows 
#'  (= genes) as a matrix in the SingleCellExperiment container.
#'
#' @return
#' If return_input =TRUE for SingleCellExperiment input returns a 
#' SingleCellExperiment object with a matrix of standardized coordinates of 
#' the columns in
#' reducedDim(obj, "CA"). Additionally, the matrix contains the following 
#' attributes:
#' "prin_coords_rows": Principal coordinates of the rows.
#' "singval": Singular values. For the explained inertia of each principal 
#' axis calculate singval^2.
#' "percInertia": Percent explained inertia of each principal axis.
#' To recompute a regular "cacomp" object from a SingleCellExperiment without 
#' rerunning cacomp use `as.cacomp()`.
#' @param assay Character. The assay from which extract the count matrix for 
#' SVD, e.g. "RNA" for Seurat objects or "counts"/"logcounts" for 
#' SingleCellExperiments.
#' @param return_input Logical. If TRUE returns the input 
#' (SingleCellExperiment/Seurat object) with the CA results saved in the 
#' reducedDim/DimReduc slot "CA".
#'  Otherwise returns a "cacomp". Default FALSE.
#' @rdname cacomp
#' @export
#' @examples
#' 
#' ########################
#' # SingleCellExperiment #
#' ########################
#' library(SingleCellExperiment)
#' set.seed(1234)
#'
#' # Simulate counts
#' cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'                x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#' logcnts <- log2(cnts + 1)
#'
#' # Create SingleCellExperiment object
#' sce <- SingleCellExperiment(assays=list(counts=cnts, logcounts=logcnts))
#'
#' # run CA and save in dim. reduction slot.
#' sce <- cacomp(sce, return_input = TRUE, assay = "counts") # on counts
#' sce <- cacomp(sce, return_input = TRUE, assay = "logcounts") # on logcounts
#'
#' # run CA and return cacomp object.
#' ca <- cacomp(sce, return_input = FALSE, assay = "counts")
setMethod(f = "cacomp",
          signature=(obj="SingleCellExperiment"),
          function(obj,
                   coords = TRUE,
                   princ_coords = 3,
                   python = FALSE,
                   dims = NULL,
                   top = 5000,
                   inertia = TRUE,
                   rm_zeros = TRUE,
                   ...,
                   assay = "counts",
                   return_input = FALSE){

  stopifnot("obj doesnt belong to class 'SingleCellExperiment'" = is(obj, "SingleCellExperiment"))
  stopifnot("Set coords = TRUE when inputting a SingleCellExperiment object and return_input = TRUE." = coords == TRUE)

  mat <- SummarizedExperiment::assay(obj, assay)
  mat <- as.matrix(mat)

  top <- min(nrow(mat), top)

  caobj <- run_cacomp(obj = mat,
                     coords = coords,
                     top = top,
                     princ_coords = princ_coords,
                     dims = dims,
                     python = python,
                     rm_zeros = rm_zeros,
                     inertia = inertia,
                     ...)

  if (return_input == TRUE){
    prinInertia <- caobj@D^2
    percentInertia <- prinInertia / sum(prinInertia) * 100

    # Saving the results
    ca <- caobj@std_coords_cols
    attr(ca, "prin_coords_rows") <- caobj@prin_coords_rows
    attr(ca, "singval") <- caobj@D
    attr(ca, "percInertia") <- percentInertia

    SingleCellExperiment::reducedDim(obj, "CA") <- ca

    return(obj)

  } else {
    return(caobj)
  }

})


#' Subset dimensions of a caobj
#'
#' @description Subsets the dimensions according to user input.
#'
#' @return Returns caobj.
#'
#' @param caobj A caobj.
#' @param dims Integer. Number of dimensions.
#' @examples
#' # Simulate scRNAseq data.
#' cnts <- data.frame(cell_1 = rpois(10, 5),
#'                    cell_2 = rpois(10, 10),
#'                    cell_3 = rpois(10, 20))
#' rownames(cnts) <- paste0("gene_", 1:10)
#' cnts <- as.matrix(cnts)
#'
#' # Run correspondence analysis.
#' ca <- cacomp(cnts)
#' ca <- subset_dims(ca, 2)
#' @export
subset_dims <- function(caobj, dims){
  
  if (dims == 1){stop("Please choose more than 1 dimension.")}
  
  stopifnot(is(caobj, "cacomp"))
  
  if (is.null(dims)){
    return(caobj)
  }
  
  if(dims > length(caobj@D)){
    warning("dims is larger than the number of available dimensions.",
            " Argument ignored")
  } else if (dims == length(caobj@D)){
    caobj@dims <- dims
    return(caobj)
  }

  dims <- min(dims, length(caobj@D))
  caobj@dims <- dims
  dims <- seq(dims)
  caobj@U <- caobj@U[,dims]
  caobj@V <- caobj@V[,dims]
  caobj@D <- caobj@D[dims]

  if (!is.empty(caobj@std_coords_cols)){
    caobj@std_coords_cols <- caobj@std_coords_cols[,dims]
  }
  if (!is.empty(caobj@prin_coords_cols)){
    caobj@prin_coords_cols <- caobj@prin_coords_cols[,dims]
  }

  if (!is.empty(caobj@std_coords_rows)){
    caobj@std_coords_rows <- caobj@std_coords_rows[,dims]
  }
  if (!is.empty(caobj@prin_coords_rows)){
    caobj@prin_coords_rows <- caobj@prin_coords_rows[,dims]
  }

  stopifnot(validObject(caobj))
  return(caobj)
}


#' Calculate correspondence analysis row and column coordinates.
#'
#' @description `ca_coords` calculates the standardized and principal 
#' coordinates of the rows and columns in CA space.
#'
#' @details
#' Takes a "cacomp" object and calculates standardized and principal 
#' coordinates for the visualization of CA results in a biplot or
#' to subsequently calculate coordinates in an Association Plot.
#'
#' @return
#' Returns input object with coordinates added.
#' std_coords_rows/std_coords_cols: Standardized coordinates of rows/columns.
#' prin_coords_rows/prin_coords_cols: Principal coordinates of rows/columns.
#'
#' @param caobj A "cacomp" object as outputted from `cacomp()`.
#' @param dims Integer indicating the number of dimensions to use for the 
#' calculation of coordinates.
#' All elements of caobj (where applicable) will be reduced to the given 
#' number of dimensions. Default NULL (keeps all dimensions).
#' @param princ_only Logical, whether only principal coordinates should be 
#' calculated.
#' Or, in other words, whether the standardized coordinates are already 
#' calculated and stored in `caobj`. Default `FALSE`.
#' @param princ_coords Integer. Number indicating whether principal 
#' coordinates should be calculated for the rows (=1), columns (=2), both (=3) 
#' or none (=0).
#' Default 3.
#' @examples
#' # Simulate scRNAseq data.
#' cnts <- data.frame(cell_1 = rpois(10, 5),
#'                    cell_2 = rpois(10, 10),
#'                    cell_3 = rpois(10, 20))
#' rownames(cnts) <- paste0("gene_", 1:10)
#' cnts <- as.matrix(cnts)
#'
#' # Run correspondence analysis.
#' ca <- cacomp(obj = cnts, princ_coords = 1)
#' ca <- ca_coords(ca, princ_coords = 3)
#' @export
ca_coords <- function(caobj, dims=NULL, princ_coords = 3, princ_only = FALSE){

  stopifnot(is(caobj, "cacomp"))
  stopifnot(dims <= length(caobj@D))

  if(!is.null(dims)){
    if (dims > length(caobj@D)){
      warning("Chosen dimensions are larger than the number of ",
              "dimensions obtained from the singular value ",
              "decomposition. Argument ignored.")
     }
      caobj <- subset_dims(caobj = caobj, dims = dims)
    }


  if(princ_only == FALSE){

    #standard coordinates
    if(dims == 1 && !is.null(dims)){
      caobj@std_coords_rows <- caobj@U/sqrt(caobj@row_masses)
      caobj@std_coords_cols <- caobj@V/sqrt(caobj@col_masses)
    } else {
      caobj@std_coords_rows <- sweep(x = caobj@U,
                                     MARGIN = 1,
                                     STATS = sqrt(caobj@row_masses),
                                     FUN = "/")
      caobj@std_coords_cols <- sweep(x = caobj@V,
                                     MARGIN = 1,
                                     STATS = sqrt(caobj@col_masses),
                                     FUN = "/")
    }


    # Ensure no NA/Inf after dividing by 0.
    caobj@std_coords_rows[is.na(caobj@std_coords_rows)] <- 0
    caobj@std_coords_cols[is.na(caobj@std_coords_cols)] <- 0
    caobj@std_coords_rows[is.infinite(caobj@std_coords_rows)] <- 0
    caobj@std_coords_cols[is.infinite(caobj@std_coords_cols)] <- 0

  }


  stopifnot("princ_coords must be either 0, 1, 2 or 3" = 
              (princ_coords == 0 || 
               princ_coords == 1 || 
               princ_coords == 2 || 
               princ_coords == 3))

  if(princ_coords != 0){
    stopifnot(!is.empty(caobj@std_coords_rows))
    stopifnot(!is.empty(caobj@std_coords_cols))

      if (princ_coords == 1){
        #principal coordinates for rows
        if (dims == 1 && !is.null(dims)){
          caobj@prin_coords_rows <- caobj@std_coords_rows*caobj@D
        } else {
          caobj@prin_coords_rows <- sweep(caobj@std_coords_rows,
                                          2,
                                          caobj@D,
                                          "*")
        }

      } else if (princ_coords == 2) {
        #principal coordinates for columns
        if (dims == 1 && !is.null(dims)){
          caobj@prin_coords_cols <- caobj@std_coords_cols*caobj@D
        } else {
          caobj@prin_coords_cols <- sweep(caobj@std_coords_cols,
                                          2,
                                          caobj@D,
                                          "*")
        }
      } else if (princ_coords  == 3) {

        if (dims == 1 && !is.null(dims)){
          #principal coordinates for rows
          caobj@prin_coords_rows <- caobj@std_coords_rows*caobj@D
          #principal coordinates for columns
          caobj@prin_coords_cols <- caobj@std_coords_cols*caobj@D
        } else {
          #principal coordinates for rows
          caobj@prin_coords_rows <- sweep(caobj@std_coords_rows,
                                          2,
                                          caobj@D,
                                          "*")
          #principal coordinates for columns
          caobj@prin_coords_cols <- sweep(caobj@std_coords_cols,
                                          2,
                                          caobj@D,
                                          "*")
        }

      }

  }

  stopifnot(validObject(caobj))
  return(caobj)
}


#' Scree Plot
#'
#'@description Plots a scree plot.
#'
#'@return
#'Returns a ggplot object.
#'
#'@param df A data frame with columns "dims" and "inertia".
scree_plot <- function(df){

  stopifnot(c("dims", "inertia") %in% colnames(df))

  avg_inertia <- 100/nrow(df)
  max_num_dims <- nrow(df)

  screeplot <- ggplot2::ggplot(df, ggplot2::aes(x=.data$dims,
                                                y=.data$inertia)) +
    ggplot2::geom_col(fill="#4169E1") +
    ggplot2::geom_line(color="#B22222", size=1) +
    ggplot2::labs(
         title = "Scree plot of explained inertia per dimensions and the average inertia",
         y="Explained inertia [%]",
         x="Dimension") +
    ggplot2::theme_bw()
  return(screeplot)
}

#' Runs elbow method
#' 
#' @description Helper function for pick_dims() to run the elbow method.
#' 
#' @param obj A "cacomp" object as outputted from `cacomp()`
#' @param mat A numeric matrix. For sequencing a count matrix, gene expression 
#' values with genes in rows and samples/cells in columns.
#' Should contain row and column names.
#' @param reps Integer. Number of permutations to perform when choosing 
#' "elbow_rule".
#' @param return_plot TRUE/FALSE. Whether a plot should be returned when 
#' choosing "elbow_rule".
#' @param python A logical value indicating whether to use singular value 
#' decomposition from the python package torch.
#' This implementation dramatically speeds up computation compared to `svd()` 
#' in R.
#' @return
#' `elbow_method` (for `return_plot=TRUE`) returns a list with two elements: 
#' "dims" contains the number of dimensions and "plot" a ggplot. if 
#' `return_plot=TRUE` it just returns the number of picked dimensions.
#' @references 
#' Ciampi, Antonio, González Marcos, Ana and Castejón Limas, Manuel. \cr
#' Correspondence analysis and 2-way clustering. (2005), SORT 29(1).
#' 
#' @examples 
#' 
#' # Get example data from Seurat
#' library(Seurat)
#' set.seed(2358)
#' cnts <- as.matrix(Seurat::GetAssayData(pbmc_small,
#'                                        assay = "RNA",
#'                                        slot = "data"))
#' # Run correspondence analysis.
#' ca <- cacomp(obj = cnts)
#' 
#' # pick dimensions with the elbow rule. Returns list.
#' pd <- pick_dims(obj = ca,
#'                 mat = cnts,
#'                 method = "elbow_rule",
#'                 return_plot = TRUE,
#'                 reps = 10)
#' pd$plot
#' ca_sub <- subset_dims(ca, dims = pd$dims)
#' 
elbow_method <- function(obj,
                         mat,
                         reps,
                         python = FALSE,
                         return_plot = FALSE){
  ev <- obj@D^2
  expl_inertia <- (ev/sum(ev)) *100
  max_num_dims <- length(obj@D)
  
  matrix_expl_inertia_perm <- matrix(0, nrow = max_num_dims , ncol = reps)
  
  pb <- txtProgressBar(min = 0, max = reps, style = 3)
  
  for (k in seq(reps)) {
    
    mat <- as.matrix(mat)
    mat_perm <- apply(mat, 2, FUN=sample)
    colnames(mat_perm) <- colnames(mat)
    rownames(mat_perm) <- seq_len(nrow(mat_perm))
    
    obj_perm <- cacomp(obj=mat_perm,
                       top = obj@top_rows,
                       dims = obj@dims,
                       coords = FALSE,
                       python = python)
    
    ev_perm <- obj_perm@D^2
    expl_inertia_perm <- (ev_perm/sum(ev_perm))*100
    
    matrix_expl_inertia_perm[,k] <- expl_inertia_perm
    colnames(matrix_expl_inertia_perm) <- paste0("perm",seq_len(reps))
    
    setTxtProgressBar(pb, k)
    
  }
  close(pb)
  
  
  if (return_plot == TRUE){
    df <- data.frame(dims = seq_len(max_num_dims),
                     inertia = expl_inertia)
    
    df <- cbind(df, matrix_expl_inertia_perm)
    
    screeplot <- scree_plot(df)
    
    for (k in seq_len(reps)) {
      
      colnm <- ggplot2::sym(paste0("perm",k))
      
      screeplot <- screeplot +
        ggplot2::geom_line(data = df, ggplot2::aes(x=.data$dims,
                                                   y=!!colnm),
                           color="black",
                           alpha=0.8,
                           linetype=2)
      
    }
  }
  
  avg_inertia_perm <- rowMeans(matrix_expl_inertia_perm)
  
  tmp <- as.integer(expl_inertia>avg_inertia_perm)
  if (sum(tmp)==0 || sum(tmp)==max_num_dims){
    dim_number <- max_num_dims								
  } else if (tmp[1] == 0){
    stop("Average inertia of the permutated data is above ",
         "the explained inertia of the data in the first dimension. ",
         "Please either try more permutations or a different method.")
  }else{
    dim_number <- length(tmp[cumsum(tmp == 0)<1 & tmp!=0])		
  }
  
  if (return_plot == FALSE){
    return(dim_number)
  } else {
    return(list("dims" = dim_number, "plot" = screeplot))
  }
}


#' Compute statistics to help choose the number of dimensions
#'
#' @description
#' Allow the user to choose from 4 different methods ("avg_inertia", 
#' "maj_inertia", "scree_plot" and "elbow_rule")
#' to estimate the number of dimensions that best represent the data.
#'
#' @details
#' * "avg_inertia" calculates the number of dimensions in which the inertia is 
#' above the average inertia.
#' * "maj_inertia" calculates the number of dimensions in which cumulatively 
#' explain up to 80% of the total inertia.
#' * "scree_plot" plots a scree plot.
#' * "elbow_rule" formalization of the commonly used elbow rule. Permutes the 
#' rows for each column and reruns `cacomp()` for a total of `reps` times.
#' The number of relevant dimensions is obtained from the point where the 
#' line for the explained inertia of the permuted data intersects with the 
#' actual data.
#'
#' @return
#' For `avg_inertia`, `maj_inertia` and `elbow_rule` (when `return_plot=FALSE`) 
#' returns an integer, indicating the suggested number of dimensions to use.
#' * `scree_plot` returns a ggplot object.
#' * `elbow_rule` (for `return_plot=TRUE`) returns a list with two elements: 
#' "dims" contains the number of dimensions and "plot" a ggplot.
#'
#' @param obj A "cacomp" object as outputted from `cacomp()`,
#' a "Seurat" object with a "CA" DimReduc object stored,
#' or a "SingleCellExperiment" object with a "CA" dim. reduction stored.
#' @param mat A numeric matrix. For sequencing a count matrix, gene expression 
#' values with genes in rows and samples/cells in columns.
#' Should contain row and column names.
#' @param method String. Either "scree_plot", "avg_inertia", "maj_inertia" or 
#' "elbow_rule" (see Details section). Default "scree_plot".
#' @param reps Integer. Number of permutations to perform when choosing 
#' "elbow_rule". Default 3.
#' @param return_plot TRUE/FALSE. Whether a plot should be returned when 
#' choosing "elbow_rule". Default FALSE.
#' @param python A logical value indicating whether to use singular value 
#' decomposition from the python package torch.
#' This implementation dramatically speeds up computation compared to `svd()` 
#' in R.
#' @param ... Arguments forwarded to methods.
#' @examples
#' # Simulate counts
#' cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'                x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#'
#' # Run correspondence analysis.
#' ca <- cacomp(obj = cnts)
#'
#' # pick dimensions with the elbow rule. Returns list.
#'
#' set.seed(2358)
#' pd <- pick_dims(obj = ca,
#'                 mat = cnts,
#'                 method = "elbow_rule",
#'                 return_plot = TRUE,
#'                 reps = 10)
#' pd$plot
#' ca_sub <- subset_dims(ca, dims = pd$dims)
#'
#' # pick dimensions which explain cumulatively >80% of total inertia.
#' # Returns vector.
#' pd <- pick_dims(obj = ca,
#'                 method = "maj_inertia")
#' ca_sub <- subset_dims(ca, dims = pd)
#' @export
#' @md
setGeneric("pick_dims", function(obj,
                                 mat = NULL,
                                 method="scree_plot",
                                 reps=3,
                                 python = FALSE,
                                 return_plot = FALSE,
                                 ...) {
  standardGeneric("pick_dims")
})


#' @rdname pick_dims
#' @export
setMethod(f = "pick_dims",
          signature=(obj="cacomp"),
          function(obj,
                   mat = NULL,
                   method="scree_plot",
                   reps=3,
                   python = FALSE,
                   return_plot = FALSE,
                   ...){

  if (!is(obj,"cacomp")){
    stop("Not a CA object. Please run cacomp() first!")
  }

  ev <- obj@D^2
  expl_inertia <- (ev/sum(ev)) *100
  max_num_dims <- length(obj@D)

  if (method == "avg_inertia"){
    # Method 1: Dim's > average inertia
    # percentage of inertia explained by 1 dimension (on average)
    avg_inertia <- 100/max_num_dims
    # result: number of dimensions, all of which explain more than avg_inertia
    dim_num <- sum(expl_inertia > avg_inertia)  
    return(dim_num)

  } else if (method == "maj_inertia"){
    # Method 2: Sum of dim's > 80% of the total inertia
    # the first dimension for which the cumulative sum of inertia (from dim1 
    # up to given dimension) is higher than 80%
    dim_num <- min(which(cumsum(expl_inertia)>80)) 
    return(dim_num)

  } else if (method == "scree_plot"){
    # Method 3: Graphical representation of explained inertia (scree plot)
    # the user can set the threshold based on the scree plot

    df <- data.frame(dims = seq_len(max_num_dims),
                     inertia = expl_inertia)

    screeplot <- scree_plot(df)

    return(screeplot)

  } else if (method == "elbow_rule") {

    if(is.null(mat)){
      cat(paste0("When running method=\"elbow_rule\", ",
                 "please provide the original data matrix (paramater mat) ",
                 "which was earlier submitted to cacomp()!"))
      stop()
    }
    
    pd <- elbow_method(obj = obj,
                       mat = mat,
                       reps = reps,
                       python = python,
                       return_plot = return_plot)
    return(pd)
    
  } else {
    cat("Please pick a valid method!")
    stop()
  }
})



#' @param assay Character. The assay from which extract the count matrix for 
#' SVD, e.g. "RNA" for Seurat objects or "counts"/"logcounts" for 
#' SingleCellExperiments.
#' @param slot Character. Data slot of the Seurat assay. 
#' E.g. "data" or "counts". Default "counts".
#'
#' @rdname pick_dims
#' @export
#' @examples
#'
#' ################################
#' # pick_dims for Seurat objects #
#' ################################
#' library(Seurat)
#' set.seed(1234)
#'
#' # Simulate counts
#' cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'                x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#'
#' # Create Seurat object
#' seu <- CreateSeuratObject(counts = cnts)
#'
#' # run CA and save in dim. reduction slot.
#' seu <- cacomp(seu, return_input = TRUE, assay = "RNA", slot = "counts")
#'
#' # pick dimensions
#' pd <- pick_dims(obj = seu,
#'                 method = "maj_inertia",
#'                 assay = "RNA",
#'                 slot = "counts")
setMethod(f = "pick_dims",
          signature=(obj="Seurat"),
          function(obj,
                   mat = NULL,
                   method="scree_plot",
                   reps=3,
                   python = FALSE,
                   return_plot = FALSE,
                   ...,
                   assay = Seurat::DefaultAssay(obj),
                   slot = "counts"){

  stopifnot("obj doesn't belong to class 'Seurat'" = is(obj, "Seurat"))

  if (method == "elbow_rule") {
    seu <- Seurat::GetAssayData(object = obj, assay = assay, slot = slot)
    seu <- as.matrix(seu)
  } else {
    seu <- NULL
  }

  if ("CA" %in% Seurat::Reductions(obj)){
    caobj <- as.cacomp(obj, assay = assay)
  } else {
    stop("No 'CA' dimension reduction object found. ",
         "Please run cacomp(seurat_obj, top, coords = FALSE, ",
         "return_input=TRUE) first.")
  }

  stopifnot(is(caobj, "cacomp"))

  pick_dims(obj = caobj,
            mat = seu,
            method = method,
            reps = reps,
            return_plot = return_plot,
            python = python)
})


#' @param assay Character. The assay from which to extract the count matrix 
#' for SVD, e.g. "RNA" for Seurat objects or "counts"/"logcounts" for 
#' SingleCellExperiments.
#'
#' @rdname pick_dims
#' @export
#' @examples
#'
#' ##############################################
#' # pick_dims for SingleCellExperiment objects #
#' ##############################################
#' library(SingleCellExperiment)
#' set.seed(1234)
#'
#' # Simulate counts
#' cnts <- mapply(function(x){rpois(n = 500, lambda = x)},
#'                x = sample(1:20, 50, replace = TRUE))
#' rownames(cnts) <- paste0("gene_", 1:nrow(cnts))
#' colnames(cnts) <- paste0("cell_", 1:ncol(cnts))
#'
#' # Create SingleCellExperiment object
#' sce <- SingleCellExperiment(assays=list(counts=cnts))
#'
#' # run CA and save in dim. reduction slot.
#' sce <- cacomp(sce, return_input = TRUE, assay = "counts")
#'
#' # pick dimensions
#' pd <- pick_dims(obj = sce,
#'                 method = "maj_inertia",
#'                 assay = "counts")
setMethod(f = "pick_dims",
          signature=(obj="SingleCellExperiment"),
          function(obj,
                   mat = NULL,
                   method="scree_plot",
                   reps=3,
                   python = FALSE,
                   return_plot = FALSE,
                   ...,
                   assay = "counts"){

  stopifnot("obj doesn't belong to class 'SingleCellExperiment'" = 
              is(obj, "SingleCellExperiment"))

  if (method == "elbow_rule") {
    mat <- SummarizedExperiment::assay(obj, assay)
  } else {
    mat <- NULL
  }

  if ("CA" %in% SingleCellExperiment::reducedDimNames(obj)){
    caobj <- as.cacomp(obj, assay = assay)
  } else {
    stop("No 'CA' dim. reduction object found. ",
         "Please run cacomp(sce, top, coords = FALSE, ",
         "return_input=TRUE) first.")
  }

  stopifnot(is(caobj, "cacomp"))
  pick_dims(obj = caobj,
           mat = mat,
           method = method,
           reps = reps,
           return_plot = return_plot,
           python = python)

})
ClemensKohl/APL documentation built on May 25, 2024, 3:15 p.m.