#' Infer an informative prior for the precision
#'
#' \code{prior_precision} uses \code{DRIMSeq}'s pipeline to infer an informative prior for the
#' precision parameter of the Dirichlet-Multinomial distribution.
#' The function computes the genewise estimates for the precision via \code{DRIMSeq::dmPrecision},
#' and calculates the mean and standard deviation of the log-precision estimates.
#'
#' @param gene_to_transcript a matrix or data.frame with a list of gene-to-transcript correspondances.
#' The first column represents the gene id, while the second one contains the transcript id.
#' @param transcript_counts a matrix or data.frame, with 1 column per sample and 1 row per transcript,
#' containing the estimated abundances for each transcript in each sample.
#' @param n_cores the number of cores to parallelize the tasks on.
#' @param transcripts_to_keep a vector containing the list of transcripts to keep.
#' Ideally, created via \code{\link{filter_transcripts}}.
#' @param max_n_genes_used the maximum number of genes to compute the prior on.
#' First, genes with at least 2 transcripts are selected.
#' Then, if more than 'max_n_genes_used' such genes are available,
#' 'max_n_genes_used' of these genes are sampled at random and used
#' to calculate the prior of the precision parameter.
#' A smaller 'max_n_genes_used' (minimum 100) will lead to faster but more approximate prior estimates.
#'
#' @return A list with 2 objects containing:
#' \itemize{
#' \item prior: a vector containing the mean and standard deviation of the log-precision, used to formulate an informative prior in \code{\link{test_DTU}};
#' \item genewise_log_precision: a numeric vector with the individual genewise estimates for the log-precision.
#' }
#'
#' @examples
#' # specify the directory of the internal data:
#' data_dir = system.file("extdata", package = "BANDITS")
#'
#' # load gene_to_transcript matching:
#' data("gene_tr_id", package = "BANDITS")
#'
#' # Load the transcript level estimated counts via tximport:
#' library(tximport)
#' quant_files = file.path(data_dir, "STAR-salmon", paste0("sample", seq_len(4)), "quant.sf")
#' txi = tximport(files = quant_files, type = "salmon", txOut = TRUE)
#' counts = txi$counts
#'
#' # Optional (recommended): transcript pre-filtering
#' transcripts_to_keep = filter_transcripts(gene_to_transcript = gene_tr_id,
#' transcript_counts = counts,
#' min_transcript_proportion = 0.01,
#' min_transcript_counts = 10,
#' min_gene_counts = 20)
#'
#' # Infer an informative prior for the precision parameter
#' # Use the same filtering criteria as in 'create_data', by choosing the same argument for 'transcripts_to_keep'.
#' # If transcript pre-filtering is not performed, leave 'transcripts_to_keep' unspecified.
#' set.seed(61217)
#' precision = prior_precision(gene_to_transcript = gene_tr_id, transcript_counts = counts,
#' n_cores = 2, transcripts_to_keep = transcripts_to_keep)
#' precision$prior
#' head(precision$genewise_log_precision)
#'
#' @author Simone Tiberi \email{simone.tiberi@uzh.ch}
#'
#' @seealso \code{\link{test_DTU}}, \code{\link{plot_precision}}
#'
#' @export
prior_precision = function(gene_to_transcript,
transcript_counts,
n_cores = 1,
transcripts_to_keep = NULL,
max_n_genes_used = 100){
# check that gene_to_transcript is a matrix or data.frame object
if( !is.data.frame(gene_to_transcript) & !is.matrix(gene_to_transcript) ){
message("'gene_to_transcript' must be a matrix or data.frame")
return(NULL)
}
if( ncol(gene_to_transcript) != 2 ){
message("'gene_to_transcript' must be a 2 column matrix or data.frame")
return(NULL)
}
# check that transcript_counts is a matrix or data.frame object
if( !is.data.frame(transcript_counts) & !is.matrix(transcript_counts) ){
message("'transcript_counts' must be a matrix or data.frame")
return(NULL)
}
if( !all( rownames(transcript_counts) %in% gene_to_transcript[,2]) ){
message("All transcript names in 'rownames(transcript_counts)' must be in 'gene_to_transcript[,2]'")
return(NULL)
}
if( !all(dim(transcript_counts) > 0.5) ){
message("'transcript_counts' must have at least 1 row (transcripts) and 1 column (samples)")
return(NULL)
}
if(max_n_genes_used < 100){
message("'max_n_genes_used' must be at least 100")
return(NULL)
}
# filter transcripts if 'transcripts_to_keep' is provided:
if(!is.null(transcripts_to_keep)){
sel = rownames(transcript_counts) %in% transcripts_to_keep
if(sum(sel) == 0){
message("0 transcripts from 'transcripts_to_keep' were found in 'rownames(transcript_counts)'")
return(NULL)
}
transcript_counts = transcript_counts[sel,]
}
N = ncol(transcript_counts)
matches = match( rownames(transcript_counts), gene_to_transcript[,2] )
gene_id = as.character( gene_to_transcript[matches,1] )
colnames(transcript_counts) = paste("sample", seq_len(N))
counts_df = data.frame(transcript_counts, gene_id = gene_id, feature_id = rownames(transcript_counts))
# remove genes with 1 transcript only:
tab = table(counts_df$gene_id)
sel_genes = names(tab)[tab > 1.5]
counts_df = counts_df[ counts_df$gene_id %in% sel_genes, ]
all_genes = unique(counts_df$gene_id)
# to speed-up prior calculations, use a subset of the genes only to compute the prior:
if( length(all_genes) > max_n_genes_used){
sel_genes_random = sample(all_genes, max_n_genes_used, replace = FALSE)
counts_df = counts_df[ counts_df$gene_id %in% sel_genes_random, ]
}
# maybe design and samples not needed for precision estimate...check the function internally!
samples = data.frame(sample_id = colnames(counts_df)[seq_len(N)],
group = "A")
d = dmDSdata(counts = counts_df, samples = samples)
design = model.matrix(~ 1, data = samples(d))
message("Estimating gene-wise precision parameters")
suppressMessages({
d = dmPrecision(d, genewise_precision = TRUE,
design = design, BPPARAM = MulticoreParam(workers = n_cores))
})
message("Estimation completed")
log_prec = log( genewise_precision(d)$genewise_precision )
mean_log_precision = mean(log_prec, na.rm = TRUE)
sd_log_precision = sd(log_prec, na.rm = TRUE)
names(log_prec) = genewise_precision(d)$gene_id
list( prior = c(mean_log_precision, sd_log_precision), genewise_log_precision = log_prec )
}
#' Plot the log-precision estimates
#'
#' \code{\link{plot_precision}} plots a histogram of the estimates for the log-precision parameter
#' of the Dirichlet-Multinomial distribution, obtained via \code{\link{prior_precision}}.
#' The solid line represents the normal prior for the log-precision parameter.
#'
#' @param prior the prior of the log-precision parameter, computed via \code{\link{prior_precision}}.
#'
#' @return A plot.
#'
#' @examples
#' # load the pre-computed precision estimates:
#' data(precision, package = "BANDITS")
#'
#' # Plot the histogram of the genewise log-precision estimates.
#' # The black solid line represents the normally distributed prior distribution
#' # for the log-precision parameter.
#' plot_precision(precision)
#'
#' @author Simone Tiberi \email{simone.tiberi@uzh.ch}
#'
#' @seealso \code{\link{test_DTU}}, \code{\link{prior_precision}}
#'
#' @export
plot_precision = function(prior){
if( !is.list(prior) ){
message("'prior' must be a list of length 2, created via 'prior_precision'")
return(NULL)
}
if( length(prior) != 2 ){
message("'prior' must be a list of length 2, created via 'prior_precision'")
return(NULL)
}
hist( prior[[2]], main = "Log-prior precision estimates",
xlab = "log-prior", freq = FALSE)
curve(dnorm(x, prior[[1]][1], prior[[1]][2]), add = TRUE, lwd = 3)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.