#' Shrink the overdispersion estimates
#'
#' Low-level function to shrink a set of overdispersion
#' estimates following the quasi-likelihood and Empirical
#' Bayesian framework.
#'
#' @param disp_est vector of overdispersion estimates
#' @param gene_means vector of average gene expression values. Used
#' to fit `disp_trend` if that is `NULL`.
#' @param df degrees of freedom for estimating the Empirical Bayesian
#' variance prior. Can be length 1 or same length as `disp_est` and
#' `gene_means`.
#' @param disp_trend vector with the dispersion trend. If `NULL` or `TRUE` the
#' dispersion trend is fitted using a (weighted) local median fit.
#' Default: `TRUE`.
#' @param ql_disp_trend a logical to indicate if a second abundance trend
#' using splines is fitted for the quasi-likelihood dispersions.
#' Default: `NULL` which means that the extra fit is only done if
#' enough observations are present.
#' @param ... additional parameters for the `loc_median_fit()` function
#' @inheritParams glm_gp
#'
#'
#'
#' @details
#' The function goes through the following steps
#' 1. Fit trend between overdispersion MLE's and the average
#' gene expression. Per default it uses the `loc_median_fit()`
#' function.
#' 2. Convert the overdispersion MLE's to quasi-likelihood
#' dispersion estimates by fixing the trended dispersion as
#' the "true" dispersion value:
#' \eqn{disp_{ql} = (1 + mu * disp_{mle}) / (1 + mu * disp_{trend})}
#' 3. Shrink the quasi-likelihood dispersion estimates using
#' Empirical Bayesian variance shrinkage (see Smyth 2004).
#'
#' @return the function returns a list with the following elements
#' \describe{
#' \item{dispersion_trend}{the dispersion trend provided by `disp_trend` or the
#' local median fit.}
#' \item{ql_disp_estimate}{the quasi-likelihood dispersion estimates based on
#' the dispersion trend, `disp_est`, and `gene_means`}
#' \item{ql_disp_trend}{the `ql_disp_estimate` still might show a trend with
#' respect to `gene_means`. If `ql_disp_trend = TRUE` a spline is used to
#' remove this secondary trend. If `ql_disp_trend = FALSE` it corresponds
#' directly to the dispersion prior}
#' \item{ql_disp_shrunken}{the shrunken quasi-likelihood dispersion estimates.
#' They are shrunken towards `ql_disp_trend`.}
#' \item{ql_df0}{the degrees of freedom of the empirical Bayesian shrinkage.
#' They correspond to spread of the `ql_disp_estimate`'s}
#' }
#'
#' @examples
#' Y <- matrix(rnbinom(n = 300 * 4, mu = 6, size = 1/4.2), nrow = 300, ncol = 4)
#' disps <- sapply(seq_len(nrow(Y)), function(idx){
#' overdispersion_mle(Y[idx, ])$estimate
#' })
#' shrink_list <- overdispersion_shrinkage(disps, rowMeans(Y), df = ncol(Y) - 1,
#' disp_trend = FALSE, ql_disp_trend = FALSE)
#'
#' plot(rowMeans(Y), shrink_list$ql_disp_estimate)
#' lines(sort(rowMeans(Y)), shrink_list$ql_disp_trend[order(rowMeans(Y))], col = "red")
#' points(rowMeans(Y), shrink_list$ql_disp_shrunken, col = "blue", pch = 16, cex = 0.5)
#'
#'
#' @references
#' * Lund, S. P., Nettleton, D., McCarthy, D. J., & Smyth, G. K. (2012). Detecting differential expression
#' in RNA-sequence data using quasi-likelihood with shrunken dispersion estimates. Statistical
#' Applications in Genetics and Molecular Biology, 11(5).
#' [https://doi.org/10.1515/1544-6115.1826](https://doi.org/10.1515/1544-6115.1826).
#' * Smyth, G. K. (2004). Linear models and empirical bayes methods for assessing differential expression
#' in microarray experiments. Statistical Applications in Genetics and Molecular Biology, 3(1).
#' [https://doi.org/10.2202/1544-6115.1027](https://doi.org/10.2202/1544-6115.1027)
#'
#' @seealso \code{limma::squeezeVar()}
#' @export
overdispersion_shrinkage <- function(disp_est, gene_means, df,
disp_trend = TRUE,
ql_disp_trend = NULL,
...,
verbose = FALSE){
est_value <- ! (is.na(disp_est) | is.na(gene_means) | is.na(df))
stopifnot(length(disp_est) == length(gene_means))
if(is.null(disp_trend) || isTRUE(disp_trend)){
if(verbose){ message("Fit dispersion trend") }
disp_trend <- rep(NA, length(disp_est))
disp_trend[est_value] <- loc_median_fit(gene_means[est_value], y = disp_est[est_value], ...)
}else if(isFALSE(disp_trend)) {
disp_trend <- rep(NA, length(disp_est))
disp_trend[est_value] <- rep(mean(disp_est, na.rm = TRUE), sum(est_value))
}
if(verbose){ message("Shrink dispersion estimates") }
# Lund et al. 2012. Equation 4. Not ideal for very small counts
# ql_disp <- rowSums2(compute_gp_deviance_residuals_matrix(Y, Mu, disp_trend)^2) / df
# The following equations are used to go between quasi-likelihood and normal representation
# variance = (gene_means + disp_est * gene_means^2)
# variance = ql_disp * (gene_means + disp_trend * gene_means^2)
ql_disp <- (1 + gene_means * disp_est) / (1 + gene_means * disp_trend)
var_pr <- variance_prior(ql_disp, df, covariate = gene_means, abundance_trend = ql_disp_trend)
list(dispersion_trend = disp_trend, ql_disp_estimate = ql_disp,
ql_disp_trend = var_pr$variance0, ql_disp_shrunken = var_pr$var_post,
ql_df0 = var_pr$df0)
}
#' Estimate the scale and df for a Inverse Chisquare distribution that generate the true gene variances
#'
#' This function implements Smyth's 2004 variance shrinkage. It also supports covariates that are
#' fitted to log(s2) with natural splines. This is based on the 2012 Lund et al. quasi-likelihood
#' paper.
#'
#' @param s2 vector of observed variances. Must not contain \code{0}'s.
#' @param df vector or single number with the degrees of freedom
#' @param covariate a vector with the same length as s2. \code{covariate} is used to regress
#' out the trend in \code{s2}. If \code{covariate = NULL}, it is ignored.
#' @param abundance_trend logical that decides if the additional abundance trend is fit
#' to the data. If `NULL` the abundance trend is fitted if there are more than 10 observations
#' and the `covariate` is not `NULL`. Default: `NULL`
#'
#' @return a list with three elements:
#' \describe{
#' \item{variance0}{estimate of the scale of the inverse Chisquared distribution. If
#' covariate is \code{NULL} a single number, otherwise a vector of \code{length(covariate)}}
#' \item{df0}{estimate of the degrees of freedom of the inverse Chisquared distribution}
#' \item{var_pos}{the shrunken variance estimates: a combination of \code{s2} and \code{variance0}}
#' }
#'
#' @seealso \code{limma::squeezeVar()}
#'
#' @keywords internal
variance_prior <- function(s2, df, covariate = NULL, abundance_trend = NULL){
stopifnot(length(s2) == length(df) || length(df) == 1)
stopifnot(is.null(covariate) || length(covariate) == length(s2))
if(is.null(covariate) && isTRUE(abundance_trend)) {
stop("If abundance_trend is true, covariate must not be 'NULL'.")
}
if(all(is.na(s2))){
# This happens if input has zero columns
return(list(variance0 = rep(NA, length(s2)), df0 = NA, var_post = s2))
}
s2_sub <- na.omit(s2)
if(all(s2_sub == 1)){
# Happens for example if overdispersion is fixed to 0 -> Poisson GLM
return(list(variance0 = rep(1, length(s2)), df0 = Inf, var_post = s2))
}
if(any(df <= 0, na.rm=TRUE)){
stop(paste0("All df must be positive. df ", paste0(which(df <= 0), collapse=", "), " are not."))
}
if(any(s2 <= 0, na.rm=TRUE)){
stop(paste0("All s2 must be positive. s2 ", paste0(which(s2 <= 0), collapse=", "), " are not."))
}
# Fit an intercept to s2
opt_res <- optim(par=c(log_variance0=0, log_df0_inv=0), function(par){
-sum(df(s2_sub/exp(par[1]), df1=df, df2=exp(par[2]), log=TRUE) - par[1], na.rm=TRUE)
})
variance0 <- rep(exp(unname(opt_res$par[1])), times = length(s2_sub))
df0 <- exp(unname(opt_res$par[2]))
if(is.numeric(abundance_trend)){
stop("Numeric abundance trend is not supported.")
}else if(is.null(covariate) || is.null(abundance_trend) || ! abundance_trend || length(s2_sub) < 10 || sum(covariate > 1e-8, na.rm = TRUE) < 10) {
if(length(s2_sub) < 10 && isTRUE(abundance_trend)){
warning("abundance_trend was set to 'TRUE', however there were not enough observations (length(s2) = ",
length(s2_sub), " < 10) to fit the trend.")
}
# Do nothing else to data
}else{
# Fit a trend!
# Fit a spline through the log(s2_sub) ~ covariate + 1 to account
# for remaing trends in s2_sub
covariate_sub <- covariate[! is.na(s2)]
log_covariate <- log(covariate_sub)
not_zero <- covariate_sub > 1e-8
ra <- range(log_covariate[not_zero])
knots <- ra[1] + c(1/3, 2/3) * (ra[2] - ra[1])
tryCatch({
design <- splines::ns(log_covariate[not_zero], df = 4, knots = knots, intercept = TRUE)
init_fit <- lm.fit(design, log(s2_sub[not_zero]))
opt_res <- optim(par=c(betas = init_fit$coefficients, log_df0_inv=0), function(par){
variance0 <- exp(design %*% par[1:4])
df0 <- exp(par[5])
-sum(df(s2_sub[not_zero]/variance0, df1=df, df2=df0, log=TRUE) - log(variance0), na.rm=TRUE)
}, control = list(maxit = 5000))
variance0[not_zero] <- c(exp(design %*% opt_res$par[1:4]))
df0 <- exp(unname(opt_res$par[5]))
}, error = function(e){
warning("Problem fitting the abundance trend to the quasi-likelihood dispersion. ",
"Falling back to non-trended variance0 estimation.")
})
}
if(opt_res$convergence != 0){
warning("Variance prior estimate did not properly converge\n")
}
var_post <- rep(NA, length(s2))
var_post[! is.na(s2)] <- (df0 * variance0 + df * s2_sub) / (df0 + df)
list(variance0 = variance0, df0 = df0, var_post = var_post)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.