R/scmet_hvf_lvf.R

Defines functions scmet_lvf scmet_hvf

Documented in scmet_hvf scmet_lvf

#' @name scmet_hvf
#' @rdname scmet_hvf_lvf
#' @aliases detect_hvf_lvf detect_hvf
#'
#' @title Detect highly (or lowly) variable features with scMET
#'
#' @description Function for calling features as highly (or lowly) variable
#'   within a datasert or cell population. This can be thought as a feature
#'   selection step, where the highly variable features (HVF) can be used for
#'   diverse downstream tasks, such as clustering or visualisation. Two
#'   approaches for identifying HVFs (or LVFs): (1) If we correct for
#'   mean-dispersion relationship, then we work directly on residual dispersions
#'   `epsilon`, and define a percentile threshold `delta_e`. This is the
#'   preferred option since the residual overdispersion is not confounded by
#'   mean methylation levels. (2) Work directly with the overdispersion
#'   parameter `gamma` and define an overdispersion contribution threshold
#'   `delta_g`, above (below) of which we call HVFs (LVFs).
#'
#' @param scmet_obj The scMET posterior object after performing inference, i.e.
#'   after calling `scmet` function.
#' @param delta_e Percentile threshold for residual overdispersion to detect
#'   variable features (between 0 and 1). Default: 0.9 for HVF and 0.1 for LVF
#'   (top 10%). NOTE: This parameter should be used when correcting for
#'   mean-dispersion relationship.
#' @param delta_g Overdispersion contribution threshold (between 0 and 1).
#' @param evidence_thresh Optional parameter. Posterior evidence probability
#'   threshold parameter `alpha_{H}` (between 0.6 and 1).
#' @param efdr Target for expected false discovery rate related to HVF/LVF
#'   detection (default = 0.1).
#'
#' @return The scMET posterior object with an additional element named `hvf` or
#'   `lvf` according to the analysis performed. This is a list object containing
#'   the following elements: \itemize{ \item{ \code{summary}: A data.frame
#'   containing HVF or LVF analysis output information per feature, including
#'   posterior medians for `mu`, `gamma`, and `epsilon`. The `tail_prob` column
#'   contains the posterior tail probability of a feature being called as HVF or
#'   LVF. The logical `is_variable` column informs whether the feature is called
#'   as variable or not.} \item{ \code{evidence_thresh}: The optimal evidence
#'   threshold. } \item{ \code{efdr}: The EFDR value.} \item{ \code{efnr}: The
#'   EFNR value. } \item{\code{efdr_grid}: The EFDR values for the grid search.}
#'   \item{\code{efnr_grid}: The EFNR values for the grid search.}
#'   \item{\code{evidence_thresh_grid}: The grid where we searched for optimal
#'   evidence threshold.} }
#'
#' @seealso \code{\link{scmet}}, \code{\link{scmet_differential}}
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Fit scMET
#' obj <- scmet(Y = scmet_dt$Y, X = scmet_dt$X, L = 4, iter = 100)
#'
#' # Run HVF analysis
#' obj <- scmet_hvf(scmet_obj = obj)
#'
#' # Run LVF analysis
#' obj <- scmet_lvf(scmet_obj = obj)
#'
#' @export
#'
scmet_hvf <- function(scmet_obj, delta_e = 0.9, delta_g = NULL,
                      evidence_thresh = 0.8, efdr = 0.1) {
  # Perform checks
  if (!(methods::is(scmet_obj, "scmet_mcmc") ||
        methods::is(scmet_obj, "scmet_vb")) ) {
    stop("Posterior object is not generated by scMET.")
  }
  if (!is.null(delta_e)) {
    assertthat::assert_that(delta_e >= 0 & delta_e <= 1)
  }
  if (!is.null(delta_g)) {
    assertthat::assert_that(delta_g >= 0 & delta_g <= 1)
  }
  if (!is.null(delta_g) & !is.null(delta_e) ) {
    stop("At least one of 'delta_e' or 'delta_g' should be NULL!")
  }
  if (!is.null(evidence_thresh)) {
    assertthat::assert_that(evidence_thresh >= 0.6 & evidence_thresh <= 1,
            msg = "Posterior evidence threshold must be between (0.6, 1).\n")
  } else {
    message("Posterior evidence threshold hasn't been supplied.\n",
            "Setting initial value to 0.8.\n")
    evidence_thresh <- 0.8
  }
  assertthat::assert_that(efdr >= 0 & efdr <= 1)

  # Extract predicted residual dispersion
  epsilon <- matrixStats::colMedians(scmet_obj$posterior$epsilon)
  # Extract dispersion parameter estimates
  gamma <- matrixStats::colMedians(scmet_obj$posterior$gamma)

  # If we want to compute HVF from residual dispersion
  if (!is.null(delta_e)) {
    eps_thresh <- stats::quantile(x = epsilon, probs = delta_e, na.rm = TRUE)
    # HVF probability for a given epsilon threshold, \pi_{j}
    prob <- matrixStats::colMeans2(scmet_obj$posterior$epsilon > eps_thresh)
  } else {# Compute HVF from dispersion parameter
    prob <- matrixStats::colMeans2(scmet_obj$posterior$gamma > delta_g)
  }

  # Threshold search and store optimal value
  tmp <- .thresh_search(evidence_thresh = evidence_thresh,
                        prob = prob[!is.na(prob)], efdr = efdr,
                        task = "HVF detection", suffix = "")
  optimal_evidence_thresh <- tmp$optimal_evidence_thresh

  # Output preparation
  mu <- matrixStats::colMedians(scmet_obj$posterior$mu)
  hvf <- ifelse(prob > optimal_evidence_thresh[1], TRUE, FALSE)

  feature_name <- scmet_obj$feature_names
  # Create table with data
  tbl <- cbind.data.frame(feature_name = feature_name,
                          mu = mu,
                          gamma = gamma,
                          epsilon = epsilon,
                          tail_prob = prob,
                          is_variable = hvf,
                          stringsAsFactors = FALSE)
  # Re-order the table of results
  tbl <- tbl[order(tbl$tail_prob, decreasing = TRUE, na.last = TRUE), ]

  # Show summary of results
  thresh_st <- ifelse(!is.null(delta_e),
                      paste("- The ", round(100 * delta_e, 2),
                            "percentile of variable genes \n"),
                      paste("- Overdispersion threshold =",
                            round(100 * delta_g, 2), "% \n"))
  message(sum(hvf, na.rm = TRUE),
          " Features classified as highly variable using: \n",
          thresh_st,
          "- Evidence threshold = ", optimal_evidence_thresh[1], "\n",
          "- EFDR = ", round(100 * optimal_evidence_thresh[2], 2), "% \n",
          "- EFNR = ", round(100 * optimal_evidence_thresh[3], 2), "% \n")
  # Store results
  scmet_obj$hvf <- list(summary = tbl,
                        evidence_thresh = optimal_evidence_thresh[1],
                        efdr = optimal_evidence_thresh[2],
                        efnr = optimal_evidence_thresh[3],
                        efdr_grid = tmp$efdr_grid,
                        efnr_grid = tmp$efnr_grid,
                        evidence_thresh_grid = tmp$evidence_thresh_grid,
                        target_efdr = efdr)

  return(scmet_obj)
}


#' @name scmet_lvf
#' @aliases scmet_hvf_lvf detect_lvf
#' @rdname scmet_hvf_lvf
#'
#' @export
#'
scmet_lvf <- function(scmet_obj, delta_e = 0.1, delta_g = NULL,
                      evidence_thresh = 0.8, efdr = 0.1) {
  # Perform checks
  if (!(methods::is(scmet_obj, "scmet_mcmc") ||
        methods::is(scmet_obj, "scmet_vb")) ) {
    stop("Posterior object is not generated from the given models.")
  }
  if (!is.null(delta_e)) {
    assertthat::assert_that(delta_e >= 0 & delta_e <= 1)
  }
  if (!is.null(delta_g)) {
    assertthat::assert_that(delta_g >= 0 & delta_g <= 1)
  }
  if (!is.null(delta_g) & !is.null(delta_e) ) {
    stop("At least one of 'delta_e' or 'delta_g' should be NULL!")
  }
  if (!is.null(evidence_thresh)) {
    assertthat::assert_that(evidence_thresh >= 0.6 & evidence_thresh <= 1,
          msg = "Posterior evidence threshold must be between (0.6, 1).\n.")
  } else {
    message("Posterior evidence threshold hasn't been supplied.\n",
            "Setting initial value to 0.8.\n")
    evidence_thresh <- 0.8
  }
  assertthat::assert_that(efdr >= 0 & efdr <= 1)

  # Extract predicted residual dispersion
  epsilon <- matrixStats::colMedians(scmet_obj$posterior$epsilon)
  # Extract dispersion parameter estimates
  gamma <- matrixStats::colMedians(scmet_obj$posterior$gamma)

  # Initialize variables
  epsilon = gamma <- rep(-1, length(scmet_obj$feature_names))
  # If we want to compute HVF from residual dispersion
  if (!is.null(delta_e)) {
    eps_thresh <- stats::quantile(x = epsilon, probs = delta_e, na.rm = TRUE)
    # HVF probability for a given epsilon threshold, \pi_{j}
    prob <- matrixStats::colMeans2(scmet_obj$posterior$epsilon < eps_thresh)
  } else {# Compute HVF from dispersion parameter
    prob <- matrixStats::colMeans2(scmet_obj$posterior$gamma < delta_g)
  }

  # Threshold search and store optimal value
  tmp <- .thresh_search(evidence_thresh = evidence_thresh,
                        prob = prob[!is.na(prob)], efdr = efdr,
                        task = "LVF detection", suffix = "")
  optimal_evidence_thresh <- tmp$optimal_evidence_thresh

  # Output preparation
  mu <- matrixStats::colMedians(scmet_obj$posterior$mu)
  lvf <- ifelse(prob > optimal_evidence_thresh[1], TRUE, FALSE)

  feature_name <- scmet_obj$feature_names
  # Create table with data
  tbl <- cbind.data.frame(feature_name = feature_name,
                          mu = mu,
                          gamma = gamma,
                          epsilon = epsilon,
                          tail_prob = prob,
                          is_variable = lvf,
                          stringsAsFactors = FALSE)
  # # Re-order the table of results
  tbl <- tbl[order(tbl$tail_prob, decreasing = TRUE, na.last = TRUE), ]

  # Show summary of results
  thresh_st <- ifelse(!is.null(delta_e),
                      paste("- The ", round(100 * delta_e, 2),
                            "percentile of variable genes \n"),
                      paste("- Overdispersion threshold =",
                            round(100 * delta_g, 2), "% \n"))
  message(sum(lvf, na.rm = TRUE),
          " Features classified as lowly variable using: \n",
          thresh_st,
          "- Evidence threshold = ", optimal_evidence_thresh[1], "\n",
          "- EFDR = ", round(100 * optimal_evidence_thresh[2], 2), "% \n",
          "- EFNR = ", round(100 * optimal_evidence_thresh[3], 2), "% \n")
  # Store results
  scmet_obj$lvf <- list(summary = tbl,
                        evidence_thresh = optimal_evidence_thresh[1],
                        efdr = optimal_evidence_thresh[2],
                        efnr = optimal_evidence_thresh[3],
                        efdr_grid = tmp$efdr_grid,
                        efnr_grid = tmp$efnr_grid,
                        evidence_thresh_grid = tmp$evidence_thresh_grid,
                        target_efdr = efdr)
  return(scmet_obj)
}
andreaskapou/scMET documentation built on Feb. 1, 2024, 10:46 a.m.