#' Fit beta-binomial across cell types
#'
#' This function performs additional inference on the allelic ratio
#' across cell types, giving posterior mean and credible
#' intervals per cell type. For each cell type, a Cauchy prior is
#' centered on the allelic ratio estimated by the fused lasso
#' across all genes in the gene cluster
#' (or on a weighted mean if fused lasso estimates are not provided).
#'
#' @param sce A SingleCellExperiment containing assays
#' (\code{"ratio"}, \code{"counts"}) and colData (\code{"x"},
#' \code{"part"})
#' @param formula The same \code{\link[stats]{formula}} object
#' used in \code{\link[airpart]{fusedLasso}}, or one involving
#' \code{\link[splines]{ns}} when smooth estimates along a continuous
#' axis are desired, e.g. \code{formula <- ratio ~ ns(x, df = 5)}
#' @param nogroup Whether a grouping step preceded this call. The default,
#' FALSE, indicates that the generalized fused lasso or the nonparametric
#' method was used to derive the partition; \code{nogroup = TRUE} means the
#' hypothesis that groups of cell types share a common regulatory context
#' is not assumed, so each cell type forms its own group
#' @param level The level of the credible interval (default is 0.95)
#' @param DAItest Whether to perform a likelihood ratio test for
#' differential allelic imbalance (DAI) across cell types, i.e., a test
#' of heterogeneity
#' @param ... arguments to pass to \code{\link[apeglm]{apeglm}}
#' functions
#'
#' @return posterior mean (\code{"ar"}) allelic ratio estimates are
#' returned in the rowData for each cell type,
#' as well as the \code{"s"} value, the false sign rate (\code{"fsr"}), and the
#' credible interval (\code{"lower"} and \code{"upper"}). Significance of
#' allelic imbalance can be assessed with \code{"fsr"} < 0.005, or by whether
#' the credible interval contains 0.5. When \code{DAItest = TRUE},
#' \code{"p.value"} gives the DAI test result and \code{"adj.p.value"} the
#' false-discovery-rate-adjusted p values.
#'
#' @examples
#'
#' sce <- makeSimulatedData()
#' sce <- preprocess(sce)
#' sce <- geneCluster(sce, G = seq_len(4))
#' sce_sub <- wilcoxExt(sce, genecluster = 1)
#' sce_sub <- allelicRatio(sce_sub, DAItest = TRUE)
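#' # a sketch of the smooth continuous-axis variant (illustrative only,
#' # assuming `x` can be treated as an ordered continuous covariate):
#' # sce_sub <- allelicRatio(sce_sub, formula = ratio ~ ns(x, df = 5))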
#' @importFrom emdbook dbetabinom
#' @importFrom stats as.formula p.adjust pchisq
#' @importFrom splines ns
#'
#' @export
allelicRatio <- function(sce, formula, nogroup = FALSE, level = 0.95, DAItest = FALSE, ...) {
if (missing(formula)) {
formula <- ratio ~ p(x, pen = "gflasso")
}
add_covs <- grep("p\\(|ns\\(",
attr(terms(formula), "term.labels"),
invert = TRUE, value = TRUE
)
  # derive the spline df for a continuous axis; df stays NULL for discrete cell types
  if (any(grepl("ns\\(", attr(terms(formula), "term.labels")))) {
    splines <- grep("ns\\(", attr(terms(formula), "term.labels"), value = TRUE)
    # extract the spline degrees of freedom, e.g. 5 from "ns(x, df = 5)"
    df <- regmatches(splines, regexec("=(.*?)\\)", splines))[[1]][2] %>% as.numeric()
  } else {
    df <- NULL
  }
  if (isTRUE(nogroup) || !is.null(df)) {
    sce$part <- sce$x
    metadata(sce)$partition <- data.frame(part = seq_len(nlevels(sce$x)), x = levels(sce$x))
  }
# derive alternative hypothesis result
x <- designMatrix(sce, add_covs, df)
res <- part(sce, x, add_covs, level = level, df = df, ...)
  if (isTRUE(DAItest)) {
    npart <- nlevels(sce$part)
    # reconstruct the coefficient matrix, one column per partition group
    coef <- res$beta[, !duplicated(res$beta[1, ])]
    ratio <- inv.logit(coef %*% t(x) + res$offset)
    loglik1 <- rowSums(emdbook::dbetabinom(assays(sce)[["a1"]], ratio, assays(sce)[["counts"]], res$theta, log = TRUE))
    # derive the null hypothesis result: a single shared ratio across groups
    sce0 <- sce
    sce0$part <- factor(1)
    x0 <- designMatrix(sce0, add_covs)
    res0 <- part(sce0, x0, add_covs, level = level, ...)
    coef0 <- res0$beta[, !duplicated(res0$beta[1, ])]
    ratio0 <- inv.logit(coef0 %*% t(x0) + res0$offset)
    loglik0 <- rowSums(emdbook::dbetabinom(assays(sce0)[["a1"]], ratio0, assays(sce0)[["counts"]], res0$theta, log = TRUE))
# Construct LR
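    # under the null of a single shared ratio across groups, LR is
    # asymptotically chi-squared with npart - 1 degrees of freedom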
    LR <- -2 * (loglik0 - loglik1)
    rowData(sce)$p.value <- pchisq(LR, npart - 1, lower.tail = FALSE)
    rowData(sce)$adj.p.value <- p.adjust(rowData(sce)$p.value, method = "fdr")
}
  if (is.null(df)) {
mean <- inv.logit(res$beta)
s <- res$s
fsr <- res$fsr
lower <- res$lower
upper <- res$upper
est <- data.frame(
round(mean, 3), format(s, digits = 3), format(fsr, digits = 3),
round(lower, 3), round(upper, 3)
) %>%
`rownames<-`(rownames(sce))
  } else {
    ## currently only return estimates for the continuous case
    ratio <- inv.logit(res$beta %*% t(x))
    mean <- ratio[, !duplicated(ratio[1, ])] %>%
      `colnames<-`(paste("ar", levels(sce$x), sep = "_"))
    est <- data.frame(round(mean, 3)) %>% `rownames<-`(rownames(sce))
  }
est_sub <- est[, setdiff(colnames(est), colnames(rowData(sce)))]
merged <- merge(rowData(sce), est_sub, by = 0) %>% DataFrame()
order <- match(rownames(sce), merged$Row.names)
rowData(sce) <- merged[order, ][-1] %>%
`rownames<-`(rownames(sce))
sce
}
## construct design matrix
designMatrix <- function(sce, add_covs, df = NULL) {
  if (!is.null(df)) {
    X <- list(part = ns(as.numeric(sce$part), df = df, intercept = TRUE))
  } else {
    if (nlevels(sce$part) == 1) {
      X <- list(part = sce$part)
    } else {
      X <- list(part = model.matrix(~ part + 0, colData(sce)))
    }
  }
  if (length(add_covs) > 0) {
    for (v in add_covs) {
      # build an indicator matrix for each additional covariate, e.g. batch
      X[[v]] <- model.matrix(as.formula(paste("~", v, "+ 0")), colData(sce))
      colnames(X[[v]]) <- levels(sce[[v]])
    }
  }
X <- do.call(cbind, X)
return(X)
}
part <- function(sce, x, add_covs, level, df = NULL, ...) {
# rough initial estimate of dispersion
theta.hat <- matrix(rep(100, dim(sce)[1]))
maxDisp <- 75
niter <- 5
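  # alternate between maximum-likelihood estimation of the coefficients
  # (apeglm with no shrinkage) and estimation of the beta-binomial
  # dispersion (bbEstDisp), iterating a few times so that theta stabilizes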
for (i in seq_len(niter)) {
param <- cbind(theta.hat[, 1], assays(sce)[["counts"]])
fit.mle <- apeglm(
Y = assays(sce)[["a1"]], x = x, log.lik = NULL,
param = param, no.shrink = TRUE, log.link = FALSE,
method = "betabinCR", interval.level = level,
)
theta.hat <- bbEstDisp(
success = assays(sce)[["a1"]],
size = assays(sce)[["counts"]], x = x,
beta = fit.mle$map, minDisp = .01, maxDisp = maxDisp, se = TRUE
)
}
  # construct an offset when adjusting for batch, summing contributions over the batch covariates
theta <- theta.hat[, 1]
offset <- matrix(0, ncol = ncol(sce), nrow = nrow(sce))
if (length(add_covs) > 0) {
for (v in add_covs) {
if (nrow(sce) == 1) {
batch <- fit.mle$map[, match(levels(sce[[v]]), colnames(x))] %>%
as.matrix() %>%
t()
} else {
batch <- fit.mle$map[, match(levels(sce[[v]]), colnames(x))]
}
feat <- sce[[v]]
offset <- offset + batch[, match(feat, colnames(batch))]
}
}
if (nrow(sce) == 1) {
param <- cbind(theta, assays(sce)[["counts"]])
coef <- 0
res <- adp.shrink(sce, fit.mle, param, level, offset, coef, log.lik = NULL, method = "betabinCR", ...)
} else {
se <- theta.hat[, 2]
    ## approximate shrinkage using a formula in the style of
    ## Efron and Morris's hierarchical model of normals
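    ## the weight B below is the fraction of total variance in the log
    ## dispersions attributable to sampling noise, so noisier per-gene
    ## estimates are pulled more strongly toward the grand mean log_disp_0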
log_disp <- log(theta)
sigma_sampling2 <- mean(se^2)
sigma_prior2 <- max((var(log_disp) - sigma_sampling2), 0)
B <- as.numeric((sigma_sampling2) / (sigma_prior2 + sigma_sampling2))
log_disp_0 <- mean(log_disp)
theta2 <- exp((1 - B) * log_disp + B * log_disp_0)
param <- cbind(theta2, assays(sce)[["counts"]])
## estimating prior mean
if ("coef" %in% colnames(colData(sce))) {
## use fused lasso estimates as prior
coef <- unique(data.frame(x = sce$part, coef = sce$coef))$coef
} else {
## use weighted mean as prior
cl_ratio <- as.vector(unlist(assays(sce)[["ratio"]]))
cl_total <- as.vector(unlist(assays(sce)[["counts"]]))
dat <- data.frame(
ratio = cl_ratio,
part = factor(rep(sce$part, each = length(sce))),
cts = cl_total
)
summary <- dat %>%
group_by(.data$part) %>%
summarise(coef = weighted.mean(.data$ratio,
.data$cts,
na.rm = TRUE
)) %>%
as.data.frame()
coef <- merge(summary, unique(data.frame(part = sce$part, x = sce$x)),
by = "part", sort = FALSE
)$coef
      coef <- log(coef / (1 - coef)) # convert prior means to the log-odds scale
}
    if (!is.null(df)) {
      res <- adp.shrink(sce, fit.mle, param, level, offset, coef,
        log.lik = betabinom.log.lik, method = "general", df = df, ...
      )
    } else {
      ## shrink one cell type at a time
      res <- adp.shrink(sce, fit.mle, param, level, offset, coef,
        log.lik = betabinom.log.lik, method = "general", ...
      )
    }
}
return(res)
}
## beta-binomial log likelihood (logit link)
betabinom.log.lik <- function(y, x, beta, param, offset) {
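  # param[1] is the beta-binomial dispersion theta and param[-1] the total
  # counts (sizes), matching how param is assembled as cbind(theta, counts)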
xbeta <- x %*% beta + offset
p.hat <- (1 + exp(-xbeta))^-1
emdbook::dbetabinom(y,
prob = p.hat, size = param[-1],
theta = param[1], log = TRUE
)
}
## derive adaptive shrinkage with prior
adp.shrink <- function(sce, fit.mle, param, level, offset, coef, log.lik, method, df = NULL, ...) {
npart <- nlevels(sce$part)
nct <- nlevels(sce$x)
  ## map each cell type to its partition group
  group <- unique(data.frame(x = sce$x, part = sce$part))
## store results
beta <- matrix(0, nrow = length(sce), ncol = nct) %>%
`colnames<-`(paste("ar", group$x, sep = "_"))
s <- matrix(0, nrow = length(sce), ncol = nct) %>%
`colnames<-`(paste("svalue", group$x, sep = "_"))
fsr <- matrix(0, nrow = length(sce), ncol = nct) %>%
`colnames<-`(paste("fsr", group$x, sep = "_"))
lower <- matrix(0, nrow = length(sce), ncol = nct) %>%
`colnames<-`(paste("lower", paste0((1 - level) * 50, "pct"),
group$x,
sep = "_"
))
upper <- matrix(0, nrow = length(sce), ncol = nct) %>%
`colnames<-`(paste("upper", paste0(100 - (1 - level) * 50, "pct"),
group$x,
sep = "_"
))
  if (!is.null(df)) {
    x <- designMatrix(sce, add_covs = NULL, df)
    fit2 <- apeglm(
      Y = assays(sce)[["a1"]], x = x, log.lik = NULL,
      param = param, no.shrink = TRUE, offset = offset, log.link = FALSE,
      method = "betabinCR", interval.level = level
    )
    beta <- fit2$map
    return(list(beta = beta, offset = offset, theta = param[, 1]))
  } else {
part <- group$part
if (nlevels(sce$part) == 1) {
x <- data.frame(part = as.numeric(sce$part)) %>% as.matrix()
} else {
x <- model.matrix(~ part + 0, colData(sce))
}
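    ## shrink one partition group's coefficient at a time; prior.scale = 1
    ## with prior.df = 1 corresponds to the Cauchy prior described in the
    ## documentation, centered on the fused lasso (or weighted mean) estimates in coef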
for (j in seq_len(npart)) {
fit.post <- apeglm(
Y = assays(sce)[["a1"]], x = x, param = param, log.lik = log.lik,
coef = j, interval.level = level, offset = offset,
log.link = FALSE, method = method,
prior.control = list(
no.shrink = setdiff(seq_len(ncol(x)), j), prior.mean = coef,
prior.scale = 1, prior.df = 1,
prior.no.shrink.mean = 0, prior.no.shrink.scale = 15
), ...
)
beta[, which(part == levels(part)[j])] <- fit.post$map[, j]
s[, which(part == levels(part)[j])] <- fit.post$svalue
fsr[, which(part == levels(part)[j])] <- fit.post$fsr
lower[, which(part == levels(part)[j])] <- inv.logit(fit.post$interval[, 1])
upper[, which(part == levels(part)[j])] <- inv.logit(fit.post$interval[, 2])
}
return(list(beta = beta, s = s, fsr = fsr, lower = lower, upper = upper, offset = offset, theta = param[,1]))
}
}
## logistic (inverse logit) function: maps log-odds to the (0, 1) ratio scale
inv.logit <- function(x) (1 + exp(-x))^-1