Nothing
utils::globalVariables(c("assay<-"))
#' Biomarker Evaluation with Targeted Minimum Loss Estimation of the ATE
#'
#' Computes the causal target parameter defined as the difference between the
#' biomarker expression values under treatment and those same values under no
#' treatment, using Targeted Minimum Loss Estimation.
#'
#' @param se A \code{SummarizedExperiment} containing microarray expression
#' or next-generation sequencing data in the \code{assays} slot and a matrix
#' of phenotype-level data in the \code{colData} slot.
#' @param varInt A \code{numeric} indicating the column of the design matrix
#' corresponding to the treatment or outcome of interest (in the
#' \code{colData} slot of the \code{SummarizedExperiment} argument "se").
#' @param normalized A \code{logical} indicating whether the data included in
#' the \code{assay} slot of the input \code{SummarizedExperiment} object has
#' been normalized externally. The default is set to \code{TRUE} with the
#' expectation that an appropriate normalization method has been applied. If
#' set to \code{FALSE}, median normalization is performed for microarray data.
#' @param ngscounts A \code{logical} indicating whether the data are counts
#' generated from a next-generation sequencing experiment (e.g., RNA-seq). The
#' default setting assumes continuous expression measures as generated by
#' microarray platforms.
#' @param parallel A \code{logical} of whether or not to use parallelization in
#' the estimation procedure. Invoking parallelization happens through a
#' combination of calls to \pkg{future} and \pkg{BiocParallel}. If this
#' argument is set to \code{TRUE}, \code{\link[future]{multiprocess}} is used,
#' and, if \code{FALSE}, \code{\link[future]{sequential}} is used, alongside
#' \code{\link[BiocParallel]{bplapply}}. Other options for evaluation through
#' futures may be invoked by setting the argument \code{future_param}.
#' @param bppar_type A \code{character} specifying the type of parallelization
#' backend to be invoked by \code{BiocParallel}. Consult the manual page for
#' \code{\link[BiocParallel]{BiocParallelParam}} for possible types and
#' descriptions on their appropriate uses. The default for this argument is
#' \code{NULL}, which silently uses \code{\link[BiocParallel]{DoparParam}}.
#' @param future_param A \code{character} specifying the parallelization type
#' to be invoked when using futures for evaluation. For a list of available
#' types, please consult the documentation for \code{\link[future]{plan}}. The
#' default setting (this argument set to \code{NULL}) silently invokes
#' \code{\link[future]{multiprocess}}.
#' @param bppar_debug A \code{logical} indicating whether or not to rely upon
#' pkg{BiocParallel}. Setting this argument to \code{TRUE}, replaces the call
#' to \code{\link[BiocParallel]{bplapply}} by a call to \code{lapply}, which
#' significantly reduces the overhead of debugging. Note that invoking this
#' option overrides all other parallelization arguments.
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#' performing targeted minimum loss estimation. Cross-validated estimates have
#' been demonstrated to allow relaxation of certain theoretical conditions and
#' and accommodate the construction of more conservative variance estimates.
#' @param g_lib A \code{character} vector specifying the library of machine
#' learning algorithms for use in fitting the propensity score P(A = a | W).
#' @param Q_lib A \code{character} vector specifying the library of machine
#' learning algorithms for use in fitting the outcome regression E[Y | A,W].
#' @param ... Additional arguments to be passed to \code{\link[drtmle]{drtmle}}
#' in computing the targeted minimum loss estimator of the average treatment
#' effect.
#'
#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
#' @importFrom BiocParallel register bplapply bpprogressbar DoparParam
#' @importFrom future plan multiprocess sequential
#' @importFrom doFuture registerDoFuture
#' @importFrom tibble as_tibble
#'
#' @return S4 object of class \code{biotmle}, inheriting from
#' \code{SummarizedExperiment}, with additional slots \code{tmleOut} and
#' \code{call}, among others, containing TML estimates of the ATE of exposure
#' on biomarker expression.
#'
#' @export biomarkertmle
#'
#' @examples
#' library(dplyr)
#' library(biotmleData)
#' library(SuperLearner)
#' library(SummarizedExperiment)
#' data(illuminaData)
#'
#' colData(illuminaData) <- colData(illuminaData) %>%
#' data.frame() %>%
#' dplyr::mutate(age = as.numeric(age > median(age))) %>%
#' DataFrame()
#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene")
#'
#' biomarkerTMLEout <- biomarkertmle(
#' se = illuminaData[1:2, ],
#' varInt = benz_idx,
#' parallel = FALSE,
#' g_lib = c("SL.mean", "SL.glm"),
#' Q_lib = c("SL.bayesglm", "SL.glm")
#' )
biomarkertmle <- function(se,
varInt,
normalized = TRUE,
ngscounts = FALSE,
parallel = TRUE,
bppar_type = NULL,
future_param = NULL,
bppar_debug = FALSE,
cv_folds = 1,
g_lib = c(
"SL.mean", "SL.glm", "SL.bayesglm", "SL.glmnet"
),
Q_lib = c(
"SL.mean", "SL.bayesglm", "SL.earth", "SL.xgboost"
),
...) {
# catch input and invoke S4 class constructor for "bioTMLE" object
call <- match.call(expand.dots = TRUE)
biotmle <- .biotmle(
SummarizedExperiment(
assays = list(expMeasures = assay(se)),
rowData = rowData(se),
colData = colData(se)
),
call = call,
tmleOut = tibble::as_tibble(matrix(NA, 10, 10)),
topTable = tibble::as_tibble(matrix(NA, 10, 10))
)
# invoke the voom transform from LIMMA if next-generation sequencing data)
if (ngscounts) {
voom_out <- rnaseq_ic(biotmle)
voom_exp <- 2^(voom_out$E)
assay(se) <- voom_exp
}
# set up parallelization based on input
doFuture::registerDoFuture()
if (parallel == TRUE) {
if (!is.null(future_param)) {
set_future_param <- parse(text = paste0("future", "::", future_param))
future::plan(eval(set_future_param))
} else {
future::plan(future::multiprocess)
}
} else if (parallel == FALSE) {
message(paste(
"Note: Sequential evaluation over many probes may take a long time."
))
future::plan(future::sequential)
}
if (!is.null(bppar_type)) {
bp_type <- eval(parse(text = paste0(
"BiocParallel", "::",
bppar_type, "()"
)))
} else {
bp_type <- BiocParallel::DoparParam()
}
BiocParallel::bpprogressbar(bp_type) <- TRUE
BiocParallel::register(bp_type, default = TRUE)
# TMLE procedure to identify biomarkers based on an EXPOSURE
if (!ngscounts && !normalized) {
# median normalization
exp_normed <- limma::normalizeBetweenArrays(as.matrix(assay(se)),
method = "scale"
)
Y <- tibble::as_tibble(t(exp_normed))
} else {
Y <- tibble::as_tibble(t(as.matrix(assay(se))))
}
# simple sanity check of whether Y includes array values
if (!all(apply(Y, 2, class) == "numeric")) {
stop("Warning - values in Y do not appear to be numeric.")
}
# exposure / treatment
A <- as.numeric(colData(se)[, varInt])
# baseline covariates
W <- tibble::as_tibble(as.data.frame(colData(se)[, -varInt]))
if (dim(W)[2] == 0) {
W <- as.numeric(rep(1, length(A)))
}
# coerce matrix of baseline covariates to numeric
if (!all(apply(W, 2, class) == "numeric")) {
W <- tibble::as_tibble(apply(W, 2, as.numeric))
}
# perform multi-level TMLE (of the ATE) for genes as Y
if (!bppar_debug) {
biomarkertmle_out <- BiocParallel::bplapply(Y[, seq_along(Y)],
exp_biomarkertmle,
W = W,
A = A,
g_lib = g_lib,
Q_lib = Q_lib,
cv_folds = cv_folds,
...
)
} else {
biomarkertmle_out <- lapply(Y[, seq_along(Y)],
exp_biomarkertmle,
W = W,
A = A,
g_lib = g_lib,
Q_lib = Q_lib,
cv_folds = cv_folds,
...
)
}
biomarkertmle_params <- do.call(c, lapply(biomarkertmle_out, `[[`, "param"))
biomarkertmle_eifs <- do.call(
cbind.data.frame,
lapply(biomarkertmle_out, `[[`, "eif")
)
biotmle@ateOut <- as.numeric(biomarkertmle_params)
if (!ngscounts) {
biotmle@tmleOut <- tibble::as_tibble(t(as.matrix(biomarkertmle_eifs)))
} else {
voom_out$E <- t(as.matrix(biomarkertmle_eifs))
biotmle@tmleOut <- voom_out
}
return(biotmle)
}
################################################################################
#' TMLE procedure using ATE for Biomarker Identication from Exposure
#'
#' This function performs influence curve-based estimation of the effect of an
#' exposure on biological expression values associated with a given biomarker,
#' controlling for a user-specified set of baseline covariates.
#'
#' @param Y A \code{numeric} vector of expression values for a given biomarker.
#' @param A A \code{numeric} vector of discretized exposure vector (e.g., from
#' a design matrix whose effect on expression values is of interest.
#' @param W A \code{Matrix} of \code{numeric} values corresponding to baseline
#' covariates to be marginalized over in the estimation process.
#' @param g_lib A \code{character} vector identifying the library of learning
#' algorithms to be used in fitting the propensity score P[A = a | W].
#' @param Q_lib A \code{character} vector identifying the library of learning
#' algorithms to be used in fitting the outcome regression E[Y | A, W].
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#' performing targeted minimum loss estimation. Cross-validated estimates are
#' more robust, allowing relaxing of theoretical conditions and construction
#' of conservative variance estimates.
#' @param ... Additional arguments passed to \code{\link[drtmle]{drtmle}} in
#' computing the targeted minimum loss estimator of the average treatment
#' effect.
#'
#' @importFrom assertthat assert_that
#' @importFrom drtmle drtmle
#'
#' @return TMLE-based estimate of the relationship between biomarker expression
#' and changes in an exposure variable, computed iteratively and saved in the
#' \code{tmleOut} slot in a \code{biotmle} object.
exp_biomarkertmle <- function(Y,
A,
W,
g_lib,
Q_lib,
cv_folds,
...) {
# check the case that Y is passed in as a column of a data.frame
if (any(class(Y) == "data.frame")) Y <- as.numeric(unlist(Y[, 1]))
if (any(class(A) == "data.frame")) A <- as.numeric(unlist(A[, 1]))
assertthat::assert_that(length(unique(A)) > 1)
# fit standard (possibly CV) TML estimator (n.b., guard = NULL)
a_0 <- sort(unique(A[!is.na(A)]))
suppressWarnings(
tmle_fit <- drtmle::drtmle(
Y = Y,
A = A,
W = W,
a_0 = a_0,
SL_g = g_lib,
SL_Q = Q_lib,
cvFolds = cv_folds,
stratify = TRUE,
guard = NULL,
parallel = FALSE,
use_future = FALSE,
...
)
)
# compute ATE and estimated EIF by delta method
ate_tmle <- tmle_fit$tmle$est[seq_along(a_0)[-1]] - tmle_fit$tmle$est[1]
eif_tmle_delta <- tmle_fit$ic$ic[, seq_along(a_0)[-1]] - tmle_fit$ic$ic[, 1]
# return only highest contrast (e.g., a[1] v a[5]) if many contrasts
if (!is.vector(eif_tmle_delta)) {
param_out <- ate_tmle[length(ate_tmle)]
eif_out <- eif_tmle_delta[, ncol(eif_tmle_delta)] +
ate_tmle[length(ate_tmle)]
} else {
param_out <- ate_tmle
eif_out <- eif_tmle_delta + ate_tmle
}
assertthat::assert_that(is.vector(eif_out))
# output
out <- list(param = param_out, eif = eif_out)
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.