R/biotmle.R
In biotmle: Targeted Learning with Moderated Statistics for Biomarker Discovery

Documented in biomarkertmle exp_biomarkertmle

utils::globalVariables(c("assay<-"))

#' Biomarker Evaluation with Targeted Minimum Loss Estimation of the ATE
#'
#' Computes the causal target parameter defined as the difference between the
#' biomarker expression values under treatment and those same values under no
#' treatment, using Targeted Minimum Loss Estimation.
#'
#' @param se A \code{SummarizedExperiment} containing microarray expression
#'  or next-generation sequencing data in the \code{assays} slot and a matrix
#'  of phenotype-level data in the \code{colData} slot.
#' @param varInt A \code{numeric} indicating the column of the design matrix
#'  corresponding to the treatment or outcome of interest (in the
#'  \code{colData} slot of the \code{SummarizedExperiment} argument "se").
#' @param normalized A \code{logical} indicating whether the data included in
#'  the \code{assay} slot of the input \code{SummarizedExperiment} object has
#'  been normalized externally. The default is set to \code{TRUE} with the
#'  expectation that an appropriate normalization method has been applied. If
#'  set to \code{FALSE}, median normalization is performed for microarray data.
#' @param ngscounts A \code{logical} indicating whether the data are counts
#'  generated from a next-generation sequencing experiment (e.g., RNA-seq). The
#'  default setting assumes continuous expression measures as generated by
#'  microarray platforms.
#' @param parallel A \code{logical} of whether or not to use parallelization in
#'  the estimation procedure. Invoking parallelization happens through a
#'  combination of calls to \pkg{future} and \pkg{BiocParallel}. If this
#'  argument is set to \code{TRUE}, \code{\link[future]{multiprocess}} is used,
#'  and, if \code{FALSE}, \code{\link[future]{sequential}} is used, alongside
#'  \code{\link[BiocParallel]{bplapply}}. Other options for evaluation through
#'  futures may be invoked by setting the argument \code{future_param}.
#' @param bppar_type A \code{character} specifying the type of parallelization
#'  backend to be invoked by \code{BiocParallel}. Consult the manual page for
#'  \code{\link[BiocParallel]{BiocParallelParam}} for possible types and
#'  descriptions on their appropriate uses. The default for this argument is
#'  \code{NULL}, which silently uses \code{\link[BiocParallel]{DoparParam}}.
#' @param future_param A \code{character} specifying the parallelization type
#'  to be invoked when using futures for evaluation. For a list of available
#'  types, please consult the documentation for \code{\link[future]{plan}}. The
#'  default setting (this argument set to \code{NULL}) silently invokes
#'  \code{\link[future]{multiprocess}}.
#' @param bppar_debug A \code{logical} indicating whether or not to rely upon
#'  pkg{BiocParallel}. Setting this argument to \code{TRUE}, replaces the call
#'  to \code{\link[BiocParallel]{bplapply}} by a call to \code{lapply}, which
#'  significantly reduces the overhead of debugging. Note that invoking this
#'  option overrides all other parallelization arguments.
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#'  performing targeted minimum loss estimation. Cross-validated estimates have
#'  been demonstrated to allow relaxation of certain theoretical conditions and
#'  and accommodate the construction of more conservative variance estimates.
#' @param g_lib A \code{character} vector specifying the library of machine
#'  learning algorithms for use in fitting the propensity score P(A = a | W).
#' @param Q_lib A \code{character} vector specifying the library of machine
#'  learning algorithms for use in fitting the outcome regression E[Y | A,W].
#' @param ... Additional arguments to be passed to \code{\link[drtmle]{drtmle}}
#'  in computing the targeted minimum loss estimator of the average treatment
#'  effect.
#'
#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
#' @importFrom BiocParallel register bplapply bpprogressbar DoparParam
#' @importFrom future plan multiprocess sequential
#' @importFrom doFuture registerDoFuture
#' @importFrom tibble as_tibble
#'
#' @return S4 object of class \code{biotmle}, inheriting from
#'  \code{SummarizedExperiment}, with additional slots \code{tmleOut} and
#'  \code{call}, among others, containing TML estimates of the ATE of exposure
#'  on biomarker expression.
#'
#' @export biomarkertmle
#'
#' @examples
#' library(dplyr)
#' library(biotmleData)
#' library(SuperLearner)
#' library(SummarizedExperiment)
#' data(illuminaData)
#'
#' colData(illuminaData) <- colData(illuminaData) %>%
#'   data.frame() %>%
#'   dplyr::mutate(age = as.numeric(age > median(age))) %>%
#'   DataFrame()
#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene")
#'
#' biomarkerTMLEout <- biomarkertmle(
#'   se = illuminaData[1:2, ],
#'   varInt = benz_idx,
#'   parallel = FALSE,
#'   g_lib = c("SL.mean", "SL.glm"),
#'   Q_lib = c("SL.bayesglm", "SL.glm")
#' )
biomarkertmle <- function(se,
                          varInt,
                          normalized = TRUE,
                          ngscounts = FALSE,
                          parallel = TRUE,
                          bppar_type = NULL,
                          future_param = NULL,
                          bppar_debug = FALSE,
                          cv_folds = 1,
                          g_lib = c(
                            "SL.mean", "SL.glm", "SL.bayesglm", "SL.glmnet"
                          ),
                          Q_lib = c(
                            "SL.mean", "SL.bayesglm", "SL.earth", "SL.xgboost"
                          ),
                          ...) {

  # catch input and invoke S4 class constructor for "bioTMLE" object
  call <- match.call(expand.dots = TRUE)
  biotmle <- .biotmle(
    SummarizedExperiment(
      assays = list(expMeasures = assay(se)),
      rowData = rowData(se),
      colData = colData(se)
    ),
    call = call,
    tmleOut = tibble::as_tibble(matrix(NA, 10, 10)),
    topTable = tibble::as_tibble(matrix(NA, 10, 10))
  )

  # invoke the voom transform from LIMMA if next-generation sequencing data)
  if (ngscounts) {
    voom_out <- rnaseq_ic(biotmle)
    voom_exp <- 2^(voom_out$E)
    assay(se) <- voom_exp
  }

  # set up parallelization based on input
  doFuture::registerDoFuture()
  if (parallel == TRUE) {
    if (!is.null(future_param)) {
      set_future_param <- parse(text = paste0("future", "::", future_param))
      future::plan(eval(set_future_param))
    } else {
      future::plan(future::multiprocess)
    }
  } else if (parallel == FALSE) {
    message(paste(
      "Note: Sequential evaluation over many probes may take a long time."
    ))
    future::plan(future::sequential)
  }
  if (!is.null(bppar_type)) {
    bp_type <- eval(parse(text = paste0(
      "BiocParallel", "::",
      bppar_type, "()"
    )))
  } else {
    bp_type <- BiocParallel::DoparParam()
  }
  BiocParallel::bpprogressbar(bp_type) <- TRUE
  BiocParallel::register(bp_type, default = TRUE)

  # TMLE procedure to identify biomarkers based on an EXPOSURE
  if (!ngscounts && !normalized) {
    # median normalization
    exp_normed <- limma::normalizeBetweenArrays(as.matrix(assay(se)),
      method = "scale"
    )
    Y <- tibble::as_tibble(t(exp_normed))
  } else {
    Y <- tibble::as_tibble(t(as.matrix(assay(se))))
  }
  # simple sanity check of whether Y includes array values
  if (!all(apply(Y, 2, class) == "numeric")) {
    stop("Warning - values in Y do not appear to be numeric.")
  }

  # exposure / treatment
  A <- as.numeric(colData(se)[, varInt])

  # baseline covariates
  W <- tibble::as_tibble(as.data.frame(colData(se)[, -varInt]))
  if (dim(W)[2] == 0) {
    W <- as.numeric(rep(1, length(A)))
  }

  # coerce matrix of baseline covariates to numeric
  if (!all(apply(W, 2, class) == "numeric")) {
    W <- tibble::as_tibble(apply(W, 2, as.numeric))
  }

  # perform multi-level TMLE (of the ATE) for genes as Y
  if (!bppar_debug) {
    biomarkertmle_out <- BiocParallel::bplapply(Y[, seq_along(Y)],
      exp_biomarkertmle,
      W = W,
      A = A,
      g_lib = g_lib,
      Q_lib = Q_lib,
      cv_folds = cv_folds,
      ...
    )
  } else {
    biomarkertmle_out <- lapply(Y[, seq_along(Y)],
      exp_biomarkertmle,
      W = W,
      A = A,
      g_lib = g_lib,
      Q_lib = Q_lib,
      cv_folds = cv_folds,
      ...
    )
  }
  biomarkertmle_params <- do.call(c, lapply(biomarkertmle_out, `[[`, "param"))
  biomarkertmle_eifs <- do.call(
    cbind.data.frame,
    lapply(biomarkertmle_out, `[[`, "eif")
  )

  biotmle@ateOut <- as.numeric(biomarkertmle_params)
  if (!ngscounts) {
    biotmle@tmleOut <- tibble::as_tibble(t(as.matrix(biomarkertmle_eifs)))
  } else {
    voom_out$E <- t(as.matrix(biomarkertmle_eifs))
    biotmle@tmleOut <- voom_out
  }
  return(biotmle)
}

################################################################################

#' TMLE procedure using ATE for Biomarker Identication from Exposure
#'
#' This function performs influence curve-based estimation of the effect of an
#' exposure on biological expression values associated with a given biomarker,
#' controlling for a user-specified set of baseline covariates.
#'
#' @param Y A \code{numeric} vector of expression values for a given biomarker.
#' @param A A \code{numeric} vector of discretized exposure vector (e.g., from
#'  a design matrix whose effect on expression values is of interest.
#' @param W A \code{Matrix} of \code{numeric} values corresponding to baseline
#'  covariates to be marginalized over in the estimation process.
#' @param g_lib A \code{character} vector identifying the library of learning
#'  algorithms to be used in fitting the propensity score P[A = a | W].
#' @param Q_lib A \code{character} vector identifying the library of learning
#'  algorithms to be used in fitting the outcome regression E[Y | A, W].
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#'  performing targeted minimum loss estimation. Cross-validated estimates are
#'  more robust, allowing relaxing of theoretical conditions and construction
#'  of conservative variance estimates.
#' @param ... Additional arguments passed to \code{\link[drtmle]{drtmle}} in
#'  computing the targeted minimum loss estimator of the average treatment
#'  effect.
#'
#' @importFrom assertthat assert_that
#' @importFrom drtmle drtmle
#'
#' @return TMLE-based estimate of the relationship between biomarker expression
#'  and changes in an exposure variable, computed iteratively and saved in the
#'  \code{tmleOut} slot in a \code{biotmle} object.
exp_biomarkertmle <- function(Y,
                              A,
                              W,
                              g_lib,
                              Q_lib,
                              cv_folds,
                              ...) {
  # check the case that Y is passed in as a column of a data.frame
  if (any(class(Y) == "data.frame")) Y <- as.numeric(unlist(Y[, 1]))
  if (any(class(A) == "data.frame")) A <- as.numeric(unlist(A[, 1]))
  assertthat::assert_that(length(unique(A)) > 1)

  # fit standard (possibly CV) TML estimator (n.b., guard = NULL)
  a_0 <- sort(unique(A[!is.na(A)]))
  suppressWarnings(
    tmle_fit <- drtmle::drtmle(
      Y = Y,
      A = A,
      W = W,
      a_0 = a_0,
      SL_g = g_lib,
      SL_Q = Q_lib,
      cvFolds = cv_folds,
      stratify = TRUE,
      guard = NULL,
      parallel = FALSE,
      use_future = FALSE,
      ...
    )
  )

  # compute ATE and estimated EIF by delta method
  ate_tmle <- tmle_fit$tmle$est[seq_along(a_0)[-1]] - tmle_fit$tmle$est[1]
  eif_tmle_delta <- tmle_fit$ic$ic[, seq_along(a_0)[-1]] - tmle_fit$ic$ic[, 1]

  # return only highest contrast (e.g., a[1] v a[5]) if many contrasts
  if (!is.vector(eif_tmle_delta)) {
    param_out <- ate_tmle[length(ate_tmle)]
    eif_out <- eif_tmle_delta[, ncol(eif_tmle_delta)] +
      ate_tmle[length(ate_tmle)]
  } else {
    param_out <- ate_tmle
    eif_out <- eif_tmle_delta + ate_tmle
  }
  assertthat::assert_that(is.vector(eif_out))

  # output
  out <- list(param = param_out, eif = eif_out)
  return(out)
}