#' Test for differential abundance: method 'censcyt-DA-censored-GLMM'
#'
#' Calculate tests for differential abundance of cell populations using method
#' 'censcyt-DA-censored-GLMM'
#'
#' Calculates tests for differential abundance of clusters, using generalized linear mixed
#' models (GLMMs) where a covariate is subject to right censoring.
#'
#'
#'
#' @param formula Model formula object, see \code{\link[diffcyt]{testDA_GLMM}} and for more
#' details \code{\link[diffcyt]{createFormula}}. Be aware of the special format required
#' for the censored covariate: instead of just the covariate name (e.g. 'X') the
#' columnname of the data being an event indicator (e.g. 'I', with 'I' = 1 if
#' 'X' is observed and 'I' = 0 if 'X' is censored, ) needs to specified as well.
#' The notation in the formula is then 'Surv(X,I)'.
#'
#' @param mi_reps number of imputations in multiple imputation. default = 10.
#'
#' @param imputation_method which method should be used in the imputation step. One of
#' 'km','km_exp','km_wei','km_os', 'rs', 'mrl', 'cc', 'pmm'. See details. default = 'km'.
#'
#' @param BPPARAM specify parallelization option as one of
#' \code{\link{BiocParallelParam}} if 'BiocParallel' is available
#' otherwise no parallelization.
#' e.g. \code{\link[BiocParallel]{MulticoreParam-class}}(workers=2) for parallelization
#' with two cores. Default is \code{\link[BiocParallel]{SerialParam-class}}()
#' (no parallelization).
#'
#' @param verbose Logical.
#'
#' @inheritParams diffcyt::testDA_GLMM
#'
#' @inherit diffcyt::testDA_GLMM return
#'
#' @details
#' The same underlying testing as described in \code{\link[diffcyt]{testDA_GLMM}} is
#' applied here. The main difference is that multiple imputation is used to
#' handle a censored covariate. In short, multiple imputation consists of three
#' steps: imputation, analysis and pooling. In the imputation step multiple complete
#' data sets are generated by imputation. The imputed data is then analysed in
#' the second step and the results are combined in the third step. See also \code{\link[mice]{pool}}.
#' The imputation in the first step is specific for censored data in contrast to
#' the 'normal' use of multiple imputation where data is missing.
#' Alternatively the samples with censored data can be removed (complete case analysis)
#' or the censored values can be treated as missing (predictive mean matching).
#'
#' Possible imputation methods in argument 'imputation_method' are:
#' \describe{
#' \item{'km'}{Kaplan Meier imputation is similar to 'rs' (Risk set imputation)
#' but the random draw is according to the survival function of
#' the respective risk set. The largest value is treated as observed
#' to obtain a complete survival function. (Taylor et al. 2002)}
#' \item{'km_exp'}{The same as 'km' but if the largest value is censored the
#' tail of the survival function is modeled as an exponential
#' distribution where the rate parameter is obtained by fixing
#' the distribution to the last observed value.
#' See (Moeschberger and Klein, 1985).}
#' \item{'km_wei'}{The same as 'km' but if the largest value is censored the
#' tail of the survival function is modeled as an weibull
#' distribution where the parameters are obtained by MLE fitting on
#' the whole data. See (Moeschberger and Klein, 1985).}
#' \item{'km_os'}{The same as 'km' but if the largest value is censored the
#' tail of the survival function is modeled by order statistics.
#' See (Moeschberger and Klein, 1985).}
#' \item{'rs'}{Risk Set imputation replaces the censored values with a random
#' draw from the risk set of the respective censored value. (Taylor et al. 2002)}
#' \item{'mrl'}{Mean Residual Life (Conditional multiple imputation, See Atem
#' et al. 2017) is a multiple imputation procedure that bootstraps
#' the data and imputes the censored values by replacing them with their
#' respective mean residual life.}
#' \item{'cc'}{complete case (listwise deletion) analysis removes incomlete samples.}
#' \item{'pmm'}{predictive mean matching treats censored values as missing and
#' uses predictive mean matching from \code{\link[mice]{mice}}.}
#' }
#'
#' @references {
#' A Comparison of Several Methods of Estimating the Survival Function When
#' There is Extreme Right Censoring (M. L. Moeschberger and John P. Klein, 1985)
#'
#' Improved conditional imputation for linear regression with a randomly
#' censored predictor (Atem et al. 2017)
#'
#' Survival estimation and testing via multiple imputation (Taylor et al. 2002)
#' }
#' @export
#' @import broom.mixed
#' @importFrom stats plogis qlogis qnorm p.adjust
#' @importFrom edgeR calcNormFactors
#'
#' @examples
#' # create small data set with 2 differential clusters with 10 samples.
#' d_counts <- simulate_multicluster(alphas = runif(10,1e4,1e5),
#' sizes = runif(10,1e4,1e5),
#' nr_diff = 2,
#' group=2,
#' return_summarized_experiment = TRUE)$counts
#'
#' # extract covariates data.frame
#' experiment_info <- SummarizedExperiment::colData(d_counts)
#' # add censoring
#' experiment_info$status <- sample(c(0,1),size=10,replace = TRUE,prob = c(0.3,0.7))
#' experiment_info$covariate[experiment_info$status == 0] <-
#' runif(10-sum(experiment_info$status),
#' min=0,
#' max=experiment_info$covariate[experiment_info$status == 0])
#'
#' # create model formula object
#' da_formula <- createFormula(experiment_info,
#' cols_fixed = c("covariate", "group_covariate"),
#' cols_random = "sample",event_indicator = "status")
#'
#' # create contrast matrix
#' contrast <- diffcyt::createContrast(c(0, 1, 0))
#'
#' # run testing with imputation method 'km'
#' outs <- testDA_censoredGLMM(d_counts = d_counts, formula = da_formula,
#' contrast = contrast, mi_reps = 2, imputation_method = "km")
#' diffcyt::topTable(outs)
#' # differential clusters:
#' which(!is.na(SummarizedExperiment::rowData(d_counts)$paired))
#'
testDA_censoredGLMM <- function(d_counts, formula, contrast, mi_reps = 10,
imputation_method = c("km","km_exp","km_wei","km_os","rs","mrl","cc","pmm"),
min_cells = 3,
min_samples = NULL,
normalize = FALSE,
norm_factors = "TMM",
BPPARAM=BiocParallel::SerialParam(),
verbose = FALSE
)
{
if (is.null(min_samples)) {
min_samples <- ncol(d_counts) / 2
}
# if censored covariate should be binarized for testing (does not influence imputation)
binarise_covariate <- stringr::str_detect(imputation_method,"_bin")
imputation_method <- ifelse(binarise_covariate,stringr::str_split(imputation_method,"_bin")[[1]][1],imputation_method)
imputation_method <- match.arg(imputation_method)
BPPARAM <- if(requireNamespace("BiocParallel",quietly = TRUE)){BPPARAM} else{NULL}
# variable names from the given formula
cmi_input <- extract_variables_from_formula(formula$formula)
if (!is.numeric(formula$data[[cmi_input$censored_variable]])){
stop(paste0("The censored variable '",cmi_input$censored_variable, "' is not numeric."), call. = FALSE)
}
# create formula for fitting
formula_glmm <- create_glmm_formula(formula$formula)
counts <- SummarizedExperiment::assays(d_counts)[["counts"]]
cluster_id <- SummarizedExperiment::rowData(d_counts)$cluster_id
# only keep counts with more than the minimum number of cells
counts_to_keep <- counts >= min_cells
# only keep clusters with more than the minimum number of samples
rows_to_keep <- apply(counts_to_keep, 1, function(r) sum(r) >= min_samples)
# subset counts and cluster_id's
counts <- counts[rows_to_keep, , drop = FALSE]
cluster_id <- cluster_id[rows_to_keep]
# normalization factors
if (normalize & norm_factors == "TMM") {
norm_factors <- calcNormFactors(counts, method = "TMM")
}
if (normalize) {
weights <- colSums(counts) / norm_factors
} else {
weights <- colSums(counts)
}
if (ncol(contrast) == 1 & nrow(contrast) > 1) {
contrast <- t(contrast)
}
if (sum(contrast) != 1 | any(!(contrast %in% c(0,1)))){
stop("General Linear Hypotheses testing is currently not supported in
'testDA_GLMM', make sure to only have one '1' in 'contrast'.", call. = FALSE)
}
if (verbose) message(paste(sum(formula$data[[cmi_input[["censoring_indicator"]]]] == 0),
"of", dim(formula$data)[1], "values are censored"))
# start fitting by iterating through the clusters, use normal lapply if
# 'BiocParallel' isn't install otherwise use bplapply
p_vals_ls <- maybe_parallel_lapply(seq_along(cluster_id),
BPPARAM=BPPARAM,
function(i) {
# data preparation
y <- counts[i, ]/weights
data_i <- cbind(y, weights, formula$data)
colnames(data_i)[c(1,2)] <- c(cmi_input$response,"weights")
# do conditional multiple imputation
if (imputation_method %in% c("mrl","rs","km", "km_exp","km_wei","km_os","pmm")){
imputation_method_cmi <- ifelse(binarise_covariate,paste0(imputation_method,"_bin"),imputation_method)
out_test <- tryCatch(suppressMessages(suppressWarnings(
conditional_multiple_imputation(
data = data_i,
formula = formula$formula,
mi_reps = mi_reps,
imputation_method = imputation_method_cmi,
regression_type = "glmer",
family = "binomial",
verbose = verbose,
weights = "weights",
contrasts = contrast
))),
error=function(e) NA)
# pooling of results from multiple imputation
pooled_results <- mice::pool(out_test$fits)
# how many imputations under quadratic rule
fraction_missing_information_u <- plogis(qlogis(max(pooled_results$pooled$fmi)) +
qnorm(0.975) * sqrt(2/pooled_results$m))
that_many_imps <- ceiling(1 + 1/2*(fraction_missing_information_u/0.05)^2)
# p-value
p_val <- tryCatch(summary(pooled_results)$p.value[ which(contrast == 1)],
error=function(e) NA)
} # complete case fitting and testing
else if (imputation_method == "cc"){
p_val <- tryCatch({
fit <- suppressMessages(suppressWarnings(complete_case(
data = data_i,censored_variable = cmi_input[["censored_variable"]],
censoring_indicator = cmi_input[["censoring_indicator"]],
formula = formula_glmm,regression_type = "glmer",
weights = "weights",family = "binomial",binarise_covariate)$fits))
test <- multcomp::glht(fit, contrast)
summary(test)$test$pvalues
},error=function(e) NA)
}
that_many_imps <- ifelse(exists("that_many_imps"),that_many_imps,NA)
return(c(p_val,that_many_imps))
})
pvals_imps_mat <- purrr::reduce(p_vals_ls,rbind)
that_many_imps_max <- max(pvals_imps_mat[,2])
if (verbose){
message(paste("suggested number of imputations:\t",
that_many_imps_max,
"\n"))
}
p_vals <- pvals_imps_mat[,1]
# fdr correction
p_adj <- p.adjust(p_vals, method = "fdr")
stopifnot(length(p_vals) == length(p_adj))
row_data <- data.frame(cluster_id = as.character(cluster_id),
p_val = p_vals,
p_adj = p_adj,
stringsAsFactors = FALSE)
# return NA if cluster has been excluded from testing because of too few observations
row_data <- suppressMessages(suppressWarnings(
dplyr::right_join(row_data,
data.frame(SummarizedExperiment::rowData(d_counts)))))
res <- SummarizedExperiment::SummarizedExperiment(
assays = list(counts = SummarizedExperiment::assay(d_counts)),
rowData = row_data,
colData = SummarizedExperiment::colData(d_counts))
# return normalization factors in 'metadata'
if (normalize) {
metadata(res)$norm_factors <- norm_factors
}
return(res)
}
# if namespace 'BiocParallel' is available run 'BiocParallel::bplapply', otherwise
# run lapply
maybe_parallel_lapply <- function(X, FUN, BPPARAM) {
if (requireNamespace("BiocParallel", quietly = TRUE)) {
BiocParallel::bplapply(X, FUN, BPPARAM=BPPARAM)
} else {
lapply(X, FUN)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.