Nothing
#' Fit mixture model
#'
#' Fit a mixture model to estimate mosaicism and XCI-escape.
#'
#' @param genic_dt A \code{data.table}. The table as outputted by \code{getGenicDP}.
#' @param model A \code{character} indicating which model to use to estimate
#' the mosaicism. Valid choices are "AUTO", "BB", "MM", "MM2", "MM3". See details.
#' @param plot A \code{logical}. If set to TRUE, information about the training
#' set and the skewing estimate will be plotted.
#' @param hist A \code{logical}. If set to TRUE, an histogram of the skewing
#' estimates will be displayed.
#' @param flag A \code{numeric}. Specify how to handle convergence issues. See
#' details.
#' @param xciGenes A \code{character} or NULL. To be passed to \code{readXCI} to
#' select the training set of inactivated genes.
#' @param a0 A \code{numeric} or NULL. Starting values for the optimization. This
#' should not be used with more than one model as different models have
#' different parameters. Leave NULL unless you know what you're doing.
#' @param optimizer A \code{character}. The optimization function to use for minimization
#' of the log-likelihood. Should be one of "nlminb" or "optim".
#' @param method A \code{character}. The method to be passed to \code{optim}
#' when it is the selected \code{optimizer}.
#' @param limits A \code{logical}. If set to TRUE, the optimization will be
#' constrained. Using upper bounds on the probability of sequencing error and
#' escape in the training set ensures that the dominant mixture represents the
#' skewing for inactivated genes.
#' @param debug A \code{logical}. If set to TRUE, information about each iteration
#' will be printed (Useful to identify problematic samples).
#'
#'
#' @details
#' The model determines the number of components used in the mixture model. By
#' default, "AUTO" tries all combinations of mixtures and the best estimate is
#' kept using backward selection based on AIC.
#' BB is a simple beta-binomial. MM adds a binomial component to model the
#' sequencing errors. MM2 jointly models the probability of misclasification
#' in the training set. MM3 include all 3 components.
#'
#' Flags in the output reports issues in convergence. If \code{flag} is set to 0,
#' nothing is done. If set to 1, the model selection will avoid flagged models
#' (will favor parcimonious models).
#' If set to 2, calls for which the best selected model had convergence issue
#' will be removed.
#'
#' @return A \code{data.table} with an entry per sample and per gene.
#'
#' @example inst/examples/betaBinomXI.R
#'
#' @seealso getGenicDP readXCI
#' @export
betaBinomXI <- function(genic_dt, model = "AUTO", plot = FALSE, hist = FALSE,
flag = 0, xciGenes = NULL, a0 = NULL,
optimizer = c("nlminb", "optim"), method = NULL, limits = TRUE,
debug = FALSE){
dt <- copy(genic_dt)
dt[, dp1 := pmin(AD_hap1, AD_hap2)]
dt[, tot := AD_hap1 + AD_hap2]
# Use XCI to estimate parameters
#if(xci > 0){
xcig <- readXCI(xciGenes)
inactivated_genes <- xcig
dt_xci <- dt[GENE %in% inactivated_genes]
if(nrow(dt_xci) == 0){
warning("No known silenced gene found in data.")
}
optimizer <- match.arg(optimizer)
model <- .check_model(model)
modl <- vector("list", length(model))
for(i in seq_along(modl)){
modi <- model[i]
if(modi == "BB"){
dt <- BB(dt_xci, dt, a0, optimizer, method, limits, debug)
} else if(modi == "MM"){
dt <- MM(dt_xci, dt, a0, optimizer, method, limits, debug)
} else if(modi == "MM2"){
dt <- MM2(dt_xci, dt, a0, optimizer, method, limits, debug)
} else if(modi == "MM3"){
dt <- MM3(dt_xci, dt, a0, optimizer, method, limits, debug)
}
modl[[i]] <- dt
}
if(length(modl) > 1){
dt <- .back_sel(modl, flag = flag)
} else{
dt <- modl[[1]]
}
dt[, f := a_est/(a_est + b_est)]
dt[, fg := dp1/tot]
# Use the estimated a and b to calculate var_fg and the test statistic
# Under H0, f_g ~ BB
dt[, var_fg := (tot * a_est * b_est * (a_est + b_est + tot))]
dt[, var_fg := var_fg/((a_est + b_est)^2 * (a_est + b_est + 1))]
dt[, var_fg := var_fg/tot^2] # Because we want the variance for the fraction, not the variance for the counts
dt[, t := (fg-f)/sqrt(var_fg)] #Test statistic
# dt[, pbb := pbb(dp1, tot, a_est, b_est, type = "midp"), by = c("GENE", "sample")]
dt[, pbb := .pbb_midp(dp1, tot, a_est, b_est), by = c("GENE", "sample")]
dt[, p_value := pnorm(t, lower.tail = FALSE)]
#tau is the Xi expression
dt[, tau := (f-fg)/(2*f-1)]
# tau is a fraction (0<tau<1) but estimate can be < 0 if fg < f
# Which could only happens if a gene is tightly inactivated
dt[, tau := ifelse(tau < 0, 0, tau)]
dt[, var_tau := (2*f-1)^-2 * var_fg]
dt[, ivw_tau := tau/sqrt(var_tau)]
if(hist){
hist(unique(dt[, f]), breaks = seq(0, .5, .01), main = "Cell fraction", xlab="Cell fraction", ylab = "samples")
}
if(plot){
plotBBCellFrac(xci_dt = dt[GENE %in% xcig])
}
return(dt[])
}
.check_model <- function(model){
model <- unique(toupper(model))
validmodels <- c("BB", "MM", "MM2", "MM3")
if("AUTO" %in% model){
model <- validmodels
}
if(any(!model %in% validmodels)){
err <- paste("Invalid models:", paste(model[!model %in% validmodels], collapse=", "),
"should be one of", paste(validmodels, collapse=", "))
stop(err)
}
return(model)
}
################################################################################
dbb <- function(x, n, a, b){#Beta binomial
#y <- choose(n, x) * beta(x+a, n-x+b)/beta(a,b)
c <- choose(n, x)
bn <- beta(x+a, n-x+b)
bd <- beta(a, b)
y <- c * bn / bd
ninf <- sum(is.infinite(y))
#if(ninf > 0){
# warning(paste("dbb returns",ninf, "infinite values"))
#}
y <- y[is.finite(y)]
return(y);
}
# Calculate log likelihood directly to avoid computation overflow
ldbb <- function(x, n, a, b){
lc <- lchoose(n, x)
lbn <- lbeta(x+a, n-x+b)
lbd <- lbeta(a, b)
ly <- lc + lbn - lbd
ninf <- sum(is.infinite(ly))
if(ninf > 0){
warning(paste("ldbb returns",ninf, "infinite values"))
}
ly <- ly[is.finite(ly)]
return(ly)
}
## P-values for exact inference
#pbb <- function(x, n, a, b, type = "midp"){ #p-value for exact inference
# if(is.na(x) | is.na(n)){
# return(as.numeric(NA))
# }
# if(x >= n)
# return(0)
# if(type == "gt"){
# from <- x+1
# sump <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
# } else if(type == "geq"){
# from <- x
# sump <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
# } else if(type == "midp"){
# from <- x+1
# t0 <- dbb(x, n, a, b)
# t1 <- sum(dbb(from:n, n, a, b), na.rm = TRUE)
# sump <- .5*t0 + t1
# } else{
# stop("Type should be one of 'gt', 'geq', 'midp'")
# }
# return(sump)
#}
.pbb_midp <- function(x, n, a, b){
if(is.na(x) | is.na(n)){
return(as.numeric(NA))
}
if(x >= n)
return(0)
from <- x+1
t0 <- exp(ldbb(x, n, a, b))
t1 <- sum(exp(ldbb(from:n, n, a, b)), na.rm = TRUE)
sump <- .5*t0 + t1
return(sump)
}
################################################################################
# Beta-binomial model:
BB <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
limits = FALSE, debug = FALSE){
dt <- copy(full_dt)
samples <- unique(dt_xci$sample)
if(is.null(a0)){
a0 <- c(1,1)
}
for(sample_i in samples){
if(debug)
print(sample_i)
dp1 <- dt_xci[sample == sample_i, dp1]
dp <- dt_xci[sample == sample_i, tot]
Nxcig <- dt_xci[sample == sample_i, .N]
if(limits){
lowb <- c(0, 0)
upb <- c(Inf, Inf)
upb <- c(699, 700)
method <- "L-BFGS-B"
} else{
lowb <- -Inf
upb <- Inf
}
if(optimizer == "nlminb"){
res_optim <- nlminb(a0, .logL_BB, dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "objective"
} else if(optimizer == "optim"){
res_optim <- optim(par = a0, fn = .logL_BB, method = method,
dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "value"
} else{
stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
}
message <- res_optim$message
message <- ifelse(is.null(message), "", message)
dt[sample == sample_i, a_est := exp(res_optim$par[1]) + 1]
dt[sample == sample_i, b_est := exp(res_optim$par[2]) + 1]
dt[sample == sample_i, model := "BB"]
dt[sample == sample_i, k := length(res_optim$par)]
dt[sample == sample_i, logL := res_optim[[negLogLname]]] #-logL
dt[sample == sample_i, convergence := res_optim$convergence]
dt[sample == sample_i, flag := message]
dt[sample == sample_i, AIC := 2*k + 2*logL]
dt[sample == sample_i, Ntrain := Nxcig]
dt[sample == sample_i, BIC := log(Ntrain)*k + 2*logL]
}
return(dt)
}
.logL_BB <- function(a, dp1, dp){
a1 <- exp(a[1]) + 1
b1 <- exp(a[2]) + 1
logLbb <- sum(ldbb(dp1,dp,a1,b1)*(-1), na.rm = TRUE)
return(logLbb)
}
################################################################################
# Mixture models:
# 1 Binom for sequencing errors and 1 BB for inactivated heterozygous SNP
MM <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
limits = FALSE, debug = FALSE){
dt <- copy(full_dt)
samples <- unique(dt_xci$sample)
if(is.null(a0)){
a0 <- c(1, 1, log(0.98/0.02), log(0.02/0.98))
}
for(sample_i in samples){
if(debug)
print(sample_i)
dp1 <- dt_xci[sample == sample_i, dp1]
dp <- dt_xci[sample == sample_i, tot]
Nxcig <- dt_xci[sample == sample_i, .N]
if(limits){
lowb <- c(0, 0, 0, -Inf)
upb <- c(Inf, Inf, Inf, log(0.2/0.8))
method <- "L-BFGS-B"
} else{
lowb <- -Inf
upb <- Inf
}
if(optimizer == "nlminb"){
res_optim <- nlminb(a0, .logL_MM, dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "objective"
} else if(optimizer == "optim"){
res_optim <- optim(par = a0, fn = .logL_MM, method = method,
dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "value"
} else{
stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
}
message <- res_optim$message
message <- ifelse(is.null(message), "", message)
dt[sample == sample_i, a_est := exp(res_optim$par[1]) + 1]
dt[sample == sample_i, b_est := exp(res_optim$par[2]) + 1]
dt[sample == sample_i, p_het := exp(res_optim$par[3])/(1 + exp(res_optim$par[3]))]
dt[sample == sample_i, pi_err := exp(res_optim$par[4])/(1 + exp(res_optim$par[4]))]
dt[sample == sample_i, model := "MM"]
dt[sample == sample_i, k := length(res_optim$par)]
dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
dt[sample == sample_i, convergence := res_optim$convergence]
dt[sample == sample_i, flag := message]
dt[sample == sample_i, AIC := 2*k + 2*logL]
dt[sample == sample_i, Ntrain := Nxcig]
}
return(dt)
}
.logL_MM <- function(a, dp1, dp) {
a1 <- exp(a[1]) +1
b1 <- exp(a[2]) +1
# Proportions have to be kept as an exponent ratio to avoid issues with boundaries
p_het <- exp(a[3])/(1+exp(a[3])); # Proba that the SNP is indeed bi-allelic (initial proba = .98)
pi_err <- 0.001+0.999*exp(a[4])/(1+exp(a[4])); # (initial proba = .02)
pb <- dbinom(dp1, dp, pi_err)
# Operations on the log scale to avoid infinite values
lpbb <- ldbb(dp1, dp, a1, b1)
p_tot <- p_het * exp(lpbb) + (1-p_het)*pb
logL <- -sum(log(p_tot))
return(logL);
}
# 1 BB for inactivated SNP and 1 BB for escaped SNP
MM2 <- function(dt_xci, full_dt, a0 = NULL, optimizer = "nlminb", method = NULL,
limits = FALSE, roundmax = TRUE, debug = FALSE){
dt <- copy(full_dt)
samples <- unique(dt_xci$sample)
if(is.null(a0)){
a0 <- c(1, 1, #Beta for the inactivated genes mean = a/(a+b) = .5
2, 2, #Beta for the escape #2 has more density around 0.5
log(0.9/0.1)) #Initial proba that a training gene is indeed inactivated in this sample
}
for(sample_i in samples){
if(debug)
print(sample_i)
dp1 <- dt_xci[sample == sample_i, dp1]
dp <- dt_xci[sample == sample_i, tot]
Nxcig <- dt_xci[sample == sample_i, .N]
err <- try({
if(limits){
# ai,bi,ae,be > .5 to have a mode
# p_inac > log(0.75/0.25) 3/4 of the training genes should be inactivated
# res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp,
# lower = c(0, 0, -Inf, -Inf, log(0.75/0.25)), # 0 mins min(p_inac) > .5
# upper = c(Inf, Inf, Inf, Inf, Inf))
lowb <- c(0, 0, -Inf, -Inf, log(0.75/0.25))
upb <- c(Inf, Inf, Inf, Inf, Inf)
method <- "L-BFGS-B"
} else{
# res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp)
lowb <- -Inf
upb <- Inf
}
if(optimizer == "nlminb"){
res_optim <- nlminb(a0, .logL_MM2, dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "objective"
} else if(optimizer == "optim"){
res_optim <- optim(par = a0, fn = .logL_MM2, method = method,
dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "value"
} else{
stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
}
message <- res_optim$message
message <- ifelse(is.null(message), "", message)
#p_inac_i <- exp(res_optim$par[5])/(1 + exp(res_optim$par[5]))
expar5 <- exp(res_optim$par[5])
if(is.finite(expar5)){
p_inac_i <- expar5/(1 + expar5)
} else if(expar5 == Inf){
p_inac_i <- 1
} else{
stop("Something went wrong with the estimation of the inactivated mixture proportion")
}
# Selecting the right component to get alpha/beta corresponding to the inactivated group
a_est_i <- exp(res_optim$par[1]) +1
b_est_i <- exp(res_optim$par[2]) +1
a_est_e <- exp(res_optim$par[3]) +1
b_est_e <- exp(res_optim$par[4]) +1
flagi <- message
fi <- a_est_i/(a_est_i + b_est_i)
fe <- a_est_e/(a_est_e + b_est_e)
# Exponentiated differences
if(roundmax){ #If we hit maximum numeric
if(is.infinite(a_est_e))
a_est_e <- .Machine$double.xmax
if(is.infinite(b_est_e))
b_est_e <- .Machine$double.xmax
if(is.infinite(fe))
fe <- 0.5
}
if(!is.finite(fi) | !is.finite(fe)){
stop(paste("For sample", sample_i, "fi is", fi, "and fe is", fe))
} else if(fi > fe){
flagi <- "MM2: fi>fe"
}
})
if(inherits(err, "try-error")){
dt[sample == sample_i, a_est := NA]
dt[sample == sample_i, b_est := NA]
dt[sample == sample_i, p_inac := NA]
dt[sample == sample_i, pi_escape := NA]
dt[sample == sample_i, model := "MM2"]
dt[sample == sample_i, k := NA]
dt[sample == sample_i, logL := NA] #Actually -logL
dt[sample == sample_i, convergence := NA]
dt[sample == sample_i, flag := "MM2: Error"]
dt[sample == sample_i, AIC := NA]
dt[sample == sample_i, Ntrain := Nxcig]
} else{
dt[sample == sample_i, a_est := a_est_i]
dt[sample == sample_i, b_est := b_est_i]
dt[sample == sample_i, p_inac := p_inac_i]
dt[sample == sample_i, pi_escape := a_est_e/(a_est_e + b_est_e)]
dt[sample == sample_i, model := "MM2"]
dt[sample == sample_i, k := length(res_optim$par)]
dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
dt[sample == sample_i, convergence := res_optim$convergence]
dt[sample == sample_i, flag := flagi]
dt[sample == sample_i, AIC := 2*k + 2*logL]
dt[sample == sample_i, Ntrain := Nxcig]
}
}
return(dt)
}
.logL_MM2 <- function(a, dp1, dp) {
# Inactivated gene from XCI
ai <- exp(a[1]) + 1
bi <- exp(a[2]) + 1
# Escape gene in XCI list
ae <- exp(a[3]) + 1
be <- exp(a[4]) + 1
p_inac <- exp(a[5])/(1+exp(a[5])) #Proba that the gene is indeed inactivated (0.9)
lpbbi <- ldbb(dp1, dp, ai, bi)
lpbbe <- ldbb(dp1, dp, ae, be)
p_tot <- p_inac * exp(lpbbi) + (1-p_inac) * exp(lpbbe)
logL <- -sum(log(p_tot))
return(logL);
}
# 1 BB for inac SNPs 1 BB for escaped SNP and 1 Binom for sequencing err
MM3 <- function(dt_xci, full_dt, a0 = NULL, optimizer ="nlminb", method = NULL,
limits = FALSE, debug = FALSE){
dt <- copy(full_dt)
samples <- unique(dt_xci$sample)
if(is.null(a0)){
a0 <- c(1, 1, #Beta for the inactivated genes mean = a/(a+b) = .5
2, 2, #Beta for the escape #2 has more density around 0.5
log(0.9/0.1), #Initial proba that a training gene is indeed inactivated in this sample
log(0.98/0.02), #Proba of SNP being actually het
log(0.02/0.98))
}
for(sample_i in samples){
if(debug)
print(sample_i)
dp1 <- dt_xci[sample == sample_i, dp1]
dp <- dt_xci[sample == sample_i, tot]
Nxcig <- dt_xci[sample == sample_i, .N]
err <- try({
if(limits){
# res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp,
# lower = c(0, 0, -Inf, -Inf, 0, 0, -Inf),
# upper = c(Inf, Inf, Inf, Inf, Inf, Inf, log(0.2/0.8)))
lowb <- c(0, 0, -Inf, -Inf, 0, 0, -Inf)
upb <- c(Inf, Inf, Inf, Inf, Inf, Inf, log(0.2/0.8))
method <- "L-BFGS-B"
} else{
lowb <- -Inf
upb <- Inf
# res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp)
}
if(optimizer == "nlminb"){
res_optim <- nlminb(a0, .logL_MM3, dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "objective"
} else if(optimizer == "optim"){
res_optim <- optim(par = a0, fn = .logL_MM3, method = method,
dp1 = dp1, dp = dp,
lower = lowb, upper = upb)
negLogLname <- "value"
} else{
stop("Unknown optimizer. Should be one of 'nlminb', 'optim'")
}
message <- res_optim$message
message <- ifelse(is.null(message), "", message)
expar5 <- exp(res_optim$par[5])
if(is.finite(expar5)){
p_inac_i <- expar5/(1 + expar5)
} else if(expar5 == Inf){
p_inac_i <- 1
} else{
stop("Something went wrong with the estimation of the inactivated mixture")
}
# if(p_inac_i >= .5){ # We assume that the list should always have a majority of inactivated genes
# a_est_i <- exp(res_optim$par[1])
# b_est_i <- exp(res_optim$par[2])
# a_est_e <- exp(res_optim$par[3])
# b_est_e <- exp(res_optim$par[4])
# } else{
# p_inac_i <- 1-p_inac_i
# a_est_i <- exp(res_optim$par[3])
# b_est_i <- exp(res_optim$par[4])
# a_est_e <- exp(res_optim$par[1])
# b_est_e <- exp(res_optim$par[2])
# }
a_est_i <- exp(res_optim$par[1]) + 1
b_est_i <- exp(res_optim$par[2]) + 1
a_est_e <- exp(res_optim$par[3]) + 1
b_est_e <- exp(res_optim$par[4]) + 1
flagi <- message
fi <- a_est_i/(a_est_i + b_est_i)
fe <- a_est_e/(a_est_e + b_est_e)
ferr_i <- exp(res_optim$par[7])/(1 + exp(res_optim$par[7]))
if(fi > fe){
flagi <- "MM3: fi>fe"
} else if(fi < ferr_i){
flagi <- "MM3: ferr>fi"
}
})
if(inherits(err, "try-error")){
dt[sample == sample_i, a_est := NA]
dt[sample == sample_i, b_est := NA]
dt[sample == sample_i, p_inac := NA]
dt[sample == sample_i, p_het := NA]
dt[sample == sample_i, pi_escape := NA]
dt[sample == sample_i, model := "MM3"]
dt[sample == sample_i, k := NA]
dt[sample == sample_i, logL := NA] #Actually -logL
dt[sample == sample_i, convergence := NA]
dt[sample == sample_i, flag := "MM3: Error"]
dt[sample == sample_i, AIC := NA]
dt[sample == sample_i, Ntrain := Nxcig]
} else{
dt[sample == sample_i, a_est := a_est_i]
dt[sample == sample_i, b_est := b_est_i]
dt[sample == sample_i, p_inac := p_inac_i]
dt[sample == sample_i, p_het := exp(res_optim$par[6])/(1+exp(res_optim$par[6]))]
dt[sample == sample_i, pi_escape := a_est_e/(a_est_e + b_est_e)]
dt[sample == sample_i, pi_err := ferr_i]
dt[sample == sample_i, model := "MM3"]
dt[sample == sample_i, k := length(res_optim$par)]
# dt[sample == sample_i, logL := res_optim$objective] #Actually -logL
dt[sample == sample_i, logL := res_optim[[negLogLname]]] #Actually -logL
dt[sample == sample_i, convergence := res_optim$convergence]
dt[sample == sample_i, flag := flagi]
dt[sample == sample_i, AIC := 2*k + 2*logL]
dt[sample == sample_i, Ntrain := Nxcig]
}
}
return(dt)
}
.logL_MM3 <- function(a, dp1, dp){
# Inactivated gene from XCI
ai <- exp(a[1]) + 1
bi <- exp(a[2]) + 1
# Escape gene in XCI list
ae <- exp(a[3]) + 1
be <- exp(a[4]) + 1
if(is.finite(exp(a[5]))){
p_inac <- exp(a[5])/(1+exp(a[5])) #Proba that the gene is not a training set error
} else if(exp(a[5]) == Inf){
p_inac <- 1 #Tends to 1
} else{
stop("Something is wrong with the a[5]")
}
if(is.finite(exp(a[6]))){
p_het <- exp(a[6])/(1+exp(a[6])) #Proba that the gene is indeed heterozygous
} else if(exp(a[6]) == Inf){
p_het <- 1 #Tends to 1
} else{
stop("Something is wrong with the a[6]")
}
pi_err <- .01 + .99*exp(a[7])/(1+exp(a[7]))
lpbb_i <- ldbb(dp1, dp, ai, bi)
lpbb_e <- ldbb(dp1, dp, ae, be)
pb_err <- dbinom(dp1, dp, pi_err)
## TODO: Handle cases where exp lpbb_i/lpbb_e > 0
## lpbb should be between 0 and 1
p_tot <- p_het * (p_inac * exp(lpbb_i) + # Reads from component1
(1-p_inac) * exp(lpbb_e)) + # Reads from component2
(1-p_het) * pb_err # Reads from sequencing error
logL <- -sum(log(p_tot))
return(logL)
}
################################################################################
# Model selection
.back_sel <- function(modl, criterion = "AIC", flag = 0){
cols <- Reduce(intersect, lapply(modl, names))
modl <- lapply(modl, function(XX){XX[, cols, with = FALSE]})
aics <- rbindlist(modl)
if(flag == 1){# Models where the sample had errors are discarded
aics <- aics[!grep("^MM", flag)]
}
aics <- aics[aics[, .I[which.min(AIC)], by = "sample,GENE"]$V1] #Model that minimizes AIC
if(flag == 2){# Remove samples where the selected model had errors
aics <- aics[!grep("^MM", flag)]
}
return(aics)
}
################################################################################
# Can also be used to visualize the cell frac from the calls
#' Plot cell fraction estimates
#'
#' Plot cell fraction estimates from list of known XCI genes
#'
#' @param xci_dt A \code{data.table}. The data to be used for the estimate
#' of skewing (i.e: limited to XCI genes).
#' @param xcig A \code{logical}. If \code{xci_dt} was not subset for training
#' genes only, setting xcig to TRUE will filter the data.
#' @param gene_names A \code{character}. If left blank, only genes that are
#' further than 20% away from the estimated skewing will be annotated, if set
#' to "all", all genes will be named. Set to "none" to remove all annotations.
#' Alternately, a \code{character} vector can be passed to annotate specific
#' genes of interest.
#' @param color_col A \code{character}. One of the columns of \code{xci_dt} can
#' be used to color genes.
#' @param xist A \code{logical}. Set to TRUE to display XIST in addition to
#' the training genes.
#'
#' @return NULL
#'
#' @details
#' This function is mostly used in \code{betaBinomXI} to ensure that the cell
#' fraction is estimated properly. However, it can be used from the output
#' of \code{betaBinomXI} to troubleshoot estimation issues.
#'
#' @return The plot object in class \code{ggplot}.
#'
#' @importFrom ggplot2 element_blank scale_colour_manual facet_wrap
#' @importFrom ggplot2 geom_point geom_text geom_hline theme_bw
plotBBCellFrac <- function(xci_dt, xcig = NULL, gene_names = "",
color_col = NULL, xist = TRUE){
plotfrac <- xci_dt[order(sample)]
if(!is.null(xcig)){
xcig <- readXCI(xcig)
plotfrac <- plotfrac[GENE %in% xcig]
}
Nt <- plotfrac[, .N, by = "sample"]
plotfrac <- merge(plotfrac, Nt, by = "sample")
plotfrac[, index := unlist(sapply(Nt[, N], function(x){seq_len(x)}))]
plotfrac[, label := ""]
if(length(gene_names) > 1){
plotfrac[GENE %in% gene_names, label := GENE]
} else if(gene_names == "all"){
plotfrac[, label := GENE]
} else if(gene_names == "none"){
plotfrac[, label := ""]
} else if(gene_names == ""){
plotfrac[abs(fg - f) > .2, label := GENE]
} else{
plotfrac[GENE %in% gene_names, label := GENE]
}
if("model" %in% colnames(plotfrac)){
gp <- geom_point(aes(shape = model))
} else{
gp <- geom_point()
}
if(is.null(color_col)){
p <- ggplot(plotfrac, aes(x = index, y = fg))
} else{
p <- ggplot(plotfrac, aes(x = index, y = fg, color = plotfrac[[color_col]]))
}
## TODO: Add the other components when available (also add option to return components from all models to betaBinomXI)
p <- p + geom_hline(aes(yintercept = f))
p <- p + gp + geom_text(aes(label = label))
#p <- p + geom_text(aes(x = N - 5, y= max(fg) + .01, label = paste("N =", N)))
p <- p + geom_text(aes(x = 0, y= max(fg) + .01, label = paste("N =", N)), hjust = "left")
p <- p + facet_wrap(~sample, scales = "free_x")
p <- p + theme_bw() + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank())
if(xist){
xists <- xci_dt[GENE == "XIST"]
p <- p + geom_point(data = xists, aes(x = Ntrain/2, y = fg), color = "red", shape = 4, size = 4)
}
print(p)
return(invisible(p))
}
#' Plot QC
#'
#' This plot shows QC for skewing estimates
#'
#' @param xci_table A \code{data.table}. Data to plot. Should be the results of
#' \code{betaBinomXI}, \code{getGenicDP} or one of the annotation functions.
#' @param xcig A \code{character} vector. The names of the genes in the
#' inactivated training set.
#' @param gene_names A \code{character}. If left blank, only genes that are
#' further than 20% away from the estimated skewing will be annotated, if set
#' to "all", all genes will be named. Set to "none" to remove all annotations.
#' Alternately, a \code{character} vector can be passed to annotate specific
#' genes of interest.
#'
#' @return An invisible plot object.
#'
#' @importFrom ggplot2 ggplot aes theme theme_bw element_blank facet_wrap
#' geom_point geom_hline geom_text
#'
#' @example inst/examples/betaBinomXI.R
#'
#' @export
plotQC <- function(xci_table, xcig = NULL, gene_names = ""){
sd_f <- f_sd_up <- f_sd_lo <- meanf <- NULL
# TODO: Order the genes along X OR by their allelic ratio
# TODO: Add confint based on the distribution: https://stats.stackexchange.com/questions/82475/calculate-the-confidence-interval-for-the-mean-of-a-beta-distribution
# TODO: Add the gene names
plottable <- xci_table[order(sample)]
plottable[, index := rowid(plottable$sample)]
plottable[, set := "Test"]
plottable[GENE %in% xcig, set := "Training"]
plottable[GENE %in% xcig, meanf := mean(fg), by = "sample"]
# Add standard deviation of estimates
plottable[, sd_f := sqrt((a_est + b_est)/((a_est + b_est)^2 * (a_est + b_est + 1)))]
plottable[, f_sd_up := pmin(0.5, f + sd_f)]
plottable[, f_sd_lo := pmax(0, f - sd_f)]
xists <- plottable[GENE == "XIST"]
# Compute index
p <- ggplot(plottable, aes(x = index, y = fg))
# Plot the skewing
p <- p + geom_hline(aes(yintercept = f))
p <- p + geom_hline(aes(yintercept = f_sd_up), lty = "dashed")
p <- p + geom_hline(aes(yintercept = f_sd_lo), lty = "dashed")
# Plot basic mean for comparison
p <- p + geom_hline(aes(yintercept = meanf), color = "blue", lty = "dotted")
p <- p + geom_text(aes(0,meanf,label = "mean", hjust = -1), colour = "blue")
# Plot the allelic ratios
p <- p + geom_point(aes(shape = model, color = set))
# Handle gene names
# Color by readcount
# Add theme
theme_QC <- theme_bw() + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank())
p <- p + facet_wrap(~sample, scales = "free_x") + theme_QC
# Plot XIST
p <- p + geom_point(data = xists, aes(x = Ntrain/2, y = fg), color = "red", shape = 4, size = 4)
# Add training gene count
p <- p + geom_text(aes(x = 0, y= max(fg) + .01, label = paste("N =", Ntrain)), hjust = "left")
suppressWarnings(print(p))
return(invisible(p))
}
#TODO: A QC plot based on the annotated dataframe instead of the gene summary.
# It requires keeping track of all SNPs (instead of the sums or just top SNPs)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.